Skip to main content

Borisd13's workspace

Charts
1 of 1
meta
["--assert_TPU_available","--output_dir","gs://craiyon_training_us_east1/cappa/20240629192029","--config_name","craiyon/cappa-jax/config-1ttqnnzh:v0","--loss_type","sigmoid","--freeze_vision","--unroll","100","--tokenizer_name","./cappa_tokenizer","--train_folder","./datacomp1b_train.pkl","--valid_folder","./datacomp1b_valid.pkl","--image_crop_resize","256","--key_caption","caption_normalized","--do_train","--do_eval","--dtype","bfloat16","--float32_logits","--learning_rate","0.0001","--warmup_steps","20000","--lr_offset","0","--lr_decay","cosine","--lr_transition_steps","200000","--batch_size_per_node","896","--gradient_accumulation","1","--weight_decay","0.0","--optim","distributed_shampoo","--beta1","0.9","--beta2","0.99","--preconditioning_compute_steps","100","--block_size_text","1024","--block_size_vision","1024","--nesterov","--graft_type","rmsprop_normalized","--mp_devices","1","--shard_shampoo_across","2d","--activation_partitioning_dims","1","--parameter_partitioning_dims","1","--logging_steps","100","--eval_steps","5000","--save_steps","20000","--wandb_entity","craiyon","--wandb_project","cappa-jax"]
[]
["--tokenizer_name","cappa_tokenizer","--train_run","craiyon/cappa-jax/ydqtfo4c","--normalize","--fraction","0.05"]
["--assert_TPU_available","--output_dir","gs://craiyon_training_europe_west4//cappa/20240626014812","--config_name","craiyon/cappa-jax/config-qif6xxft:v47","--restore_state","--unroll","100","--tokenizer_name","./cappa_tokenizer","--train_folder","./datacomp1b_train.pkl","--valid_folder","./datacomp1b_valid.pkl","--image_crop_resize","256","--key_caption","caption_normalized","--do_train","--do_eval","--n_predict","128","--n_predict_batch","8","--predict_num_beams","1","--dtype","bfloat16","--float32_logits","--remat_policy","none","--learning_rate","0.0001","--warmup_steps","0","--lr_offset","0","--lr_decay","cosine","--lr_transition_steps","500000","--batch_size_per_node","128","--gradient_accumulation","1","--weight_decay","0.0","--optim","distributed_shampoo","--beta1","0.9","--beta2","0.99","--preconditioning_compute_steps","100","--block_size_text","1024","--block_size_vision","1024","--nesterov","--graft_type","rmsprop_normalized","--mp_devices","1","--shard_shampoo_across","2d","--activation_partitioning_dims","2","--parameter_partitioning_dims","2","--logging_steps","100","--eval_steps","5000","--save_steps","20000","--wandb_entity","craiyon","--wandb_project","cappa-jax"]
["--tokenizer_name","cappa_tokenizer","--train_run","craiyon/cappa-jax/qif6xxft","--normalize","--fraction","0.05"]
["--assert_TPU_available","--output_dir","gs://craiyon_training_europe_west4//cappa/20240620224002","--config_name","../configs/large-patch16-cappa.json","--unroll","100","--tokenizer_name","./cappa_tokenizer","--train_folder","./datacomp1b_train.pkl","--valid_folder","./datacomp1b_valid.pkl","--image_crop_resize","256","--key_caption","caption_normalized","--do_train","--do_eval","--n_predict","128","--n_predict_batch","8","--predict_num_beams","1","--dtype","bfloat16","--float32_logits","--remat_policy","none","--learning_rate","0.0001","--warmup_steps","20000","--lr_offset","0","--batch_size_per_node","128","--gradient_accumulation","1","--weight_decay","0.0","--optim","distributed_shampoo","--beta1","0.9","--beta2","0.99","--preconditioning_compute_steps","100","--block_size_text","1024","--block_size_vision","1024","--nesterov","--graft_type","rmsprop_normalized","--mp_devices","1","--shard_shampoo_across","2d","--activation_partitioning_dims","2","--parameter_partitioning_dims","2","--logging_steps","100","--eval_steps","5000","--save_steps","20000","--wandb_entity","craiyon","--wandb_project","cappa-jax"]
"training/train.py"
"utils/run_benchmark_cappa.py"
"training/train.py"
"utils/run_benchmark_cappa.py"
"training/train.py"
24
48
48
56
48
56
48
96
96
112
96
112
{"remote":"https://github.com/borisdayma/clip-jax.git","commit":"78ecc4bb4930e117a44eb9b9de62e57bc1afca51","__typename":"GitInfo"}
{"remote":"https://github.com/borisdayma/clip-jax.git","commit":"78ecc4bb4930e117a44eb9b9de62e57bc1afca51","__typename":"GitInfo"}
{"remote":"https://github.com/borisdayma/clip-jax.git","commit":"aceb2f7a83488439b9e62f73330c2f32360455ea","__typename":"GitInfo"}
{"remote":"https://github.com/borisdayma/clip-jax.git","commit":"aceb2f7a83488439b9e62f73330c2f32360455ea","__typename":"GitInfo"}
{"remote":"https://github.com/borisdayma/clip-jax.git","commit":"aceb2f7a83488439b9e62f73330c2f32360455ea","__typename":"GitInfo"}
{"remote":"https://github.com/borisdayma/clip-jax.git","commit":"4ff62a6fecf92977dfd253d8b8a6d3538a5c0201","__typename":"GitInfo"}
dalle-pod
v5-pod
v5-pod
Linux-5.19.0-1022-gcp-x86_64-with-glibc2.35
Linux-5.13.0-1027-gcp-x86_64-with-glibc2.31
Linux-5.13.0-1027-gcp-x86_64-with-glibc2.31
Linux-5.19.0-1030-gcp-x86_64-with-glibc2.35
Linux-5.13.0-1027-gcp-x86_64-with-glibc2.31
Linux-5.19.0-1030-gcp-x86_64-with-glibc2.35
/home/boris/clip-jax/training/train.py
<python with no main file>
/home/boris/clip-jax/utils/run_benchmark_cappa.py
/home/boris/clip-jax/training/train.py
/home/boris/clip-jax/utils/run_benchmark_cappa.py
/home/boris/clip-jax/training/train.py
3.10.14
3.10.12
3.10.12
3.10.14
3.10.12
3.10.14
2d 23h 49m 25s
8s
3d 10h 28m 17s
2d 16h 18m 10s
13h 46m 51s
5d 3h 7m 39s
config
model_config
text_config
["gelu","linear"]
-
-
["gelu","linear"]
-
["gelu","linear"]
0
-
-
0
-
0
1
-
-
1
-
1
bfloat16
-
-
bfloat16
-
bfloat16
2
-
-
2
-
2
params_norm
2 of 37
gradients_norm
2 of 37