Atmallen8's group workspace
v2-12B-deduped_o4mu97sj
Tags
ip-26-0-152-168-0
Notes
Author
State
Failed
Start time
February 23rd, 2023 2:09:36 PM
Runtime
1m 11s
Tracked hours
-
Run path
eleutherai/pythia/3snrkxe7
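The run path is the handle for fetching this run programmatically. A minimal sketch using the W&B public API (assuming the wandb Python package is installed and you have read access to the eleutherai/pythia project):

```python
import wandb

# Look the run up by its path: entity/project/run_id, as listed under "Run path".
api = wandb.Api()
run = api.run("eleutherai/pythia/3snrkxe7")

print(run.state)    # run state, e.g. "failed"
print(run.config)   # the config keys shown (collapsed) in the Config panel below
print(run.summary)  # empty for this run, since no summary metrics were saved
```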
OS
Linux-5.15.0-1019-aws-x86_64-with-glibc2.31
Python version
3.9.16
Git repository
git clone https://github.com/EleutherAI/gpt-neox/
Git state
git checkout -b "ip-26-0-152-168-0" 7d16769659a6f308ee924f004b72248979c7bde1
Command
/fsx/gpt-neox/train.py --deepspeed_config {"train_batch_size":1024,"train_micro_batch_size_per_gpu":8,"gradient_accumulation_steps":2,"optimizer":{"type":"Adam","params":{"lr":0.00012,"betas":[0.9,0.95],"eps":1e-08}},"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"gradient_clipping":1.0,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":1260000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":1260000000,"contiguous_gradients":true,"cpu_offload":false},"wall_clock_breakdown":true} --megatron_config {"launcher":"slurm","train_batch_size":1024,"train_micro_batch_size_per_gpu":8,"gradient_accumulation_steps":2,"optimizer":{"type":"Adam","params":{"lr":0.00012,"betas":[0.9,0.95],"eps":1e-08}},"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"gradient_clipping":1.0,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":1260000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":1260000000,"contiguous_gradients":true,"cpu_offload":false},"wall_clock_breakdown":true,"precision":"fp16","num_layers":36,"hidden_size":5120,"num_attention_heads":40,"seq_length":2048,"max_position_embeddings":2048,"pos_emb":"rotary","no_weight_tying":true,"attention_config":["flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash"],"sparsity_config":{},"scaled_upper_triang_masked_softmax_fusion":true,"bias_gelu_fusion":true,"rotary_pct":0.25,"gpt_j_residual":true,"output_layer_parallelism":"column","lr_decay_style":"cosine","lr_decay_iters":143000,"min_lr":1.2e-05,"optimizer_type":"Adam","zero_stage":1,"zero_reduce_scatter":true,"zero_contiguous_gradients":true,"zero_reduce_bucket_size":1260000000,"zero_allgather_bucket_size":1260000000,"lr":0.00012,"tokenizer_type":"HFTokenizer","train_data_paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"test_data_paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"valid_data_paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"train_data_weights":[1.0],"valid_data_weights":[1.0],"test_data_weights":[1.0],"data_impl":"mmap","save":"/fsx/hailey/pythia/new_ckpts/pythia-v2-12b-deduped","config_files":{"12B-deduped.yml":{"pipe-parallel-size":1,"model-parallel-size":4,"num-layers":36,"hidden-size":5120,"num-attention-heads":40,"seq-length":2048,"max-position-embeddings":2048,"norm":"layernorm","pos-emb":"rotary","rotary_pct":0.25,"no-weight-tying":true,"gpt_j_residual":true,"output_layer_parallelism":"column","attention-config":[[["flash"],36]],"scaled-upper-triang-masked-softmax-fusion":true,"bias-gelu-fusion":true,"optimizer":{"type":"Adam","params":{"lr":0.00012,"betas":[0.9,0.95],"eps":1e-08}},"min_lr":1.2e-05,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":1260000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":1260000000,"contiguous_gradients":true,"cpu_offload":false},"train_micro_batch_size_per_gpu":8,"gradient_accumulation_steps":2,"data-impl":"mmap","checkpoint-activations":true,"checkpoint-num-layers":1,"partition-activations":true,"synchronize-each-layer":true,"gradient_clipping":1.0,"weight-decay":0.1,"hidden-dropout":0,"attention-dropout":0,"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"train-iters":143000,"lr-decay-iters":143000,"distributed-backend":"nccl","lr-decay-style":"cosine","warmup":0.01,"checkpoint-factor":1000,"extra-save-iters":[0,1,2,4,8,16,32,64,128,256,512],"eval-interval":143000,"eval-iters":10,"save":"/fsx/hailey/pythia/new_ckpts/pythia-v2-12b-deduped","load":"/fsx/hailey/pythia/new_ckpts/pythia-v2-12b-deduped","log-interval":10,"steps_per_print":10,"wall_clock_breakdown":true,"log-grad-norm":true,"train-data-paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"valid-data-paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"test-data-paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"tokenizer_type":"HFTokenizer","vocab-file":"/fsx/gpt-neox/20B_tokenizer.json","use_wandb":true,"wandb_team":"eleutherai","wandb_project":"pythia","wandb_group":"v2-12B-deduped","launcher":"slurm","deepspeed_slurm":true}},"load":"/fsx/hailey/pythia/new_ckpts/pythia-v2-12b-deduped","checkpoint_factor":1000,"extra_save_iters":[0,1,2,4,8,16,32,64,128,256,512],"batch_size":8,"train_iters":143000,"eval_iters":10,"eval_interval":143000,"vocab_file":"/fsx/gpt-neox/20B_tokenizer.json","attention_dropout":0,"hidden_dropout":0,"weight_decay":0.1,"checkpoint_activations":true,"synchronize_each_layer":true,"partition_activations":true,"gas":2,"clip_grad":1.0,"dynamic_loss_scale":true,"pipe_parallel_size":1,"model_parallel_size":4,"is_pipe_parallel":true,"use_wandb":true,"wandb_group":"v2-12B-deduped_o4mu97sj","wandb_team":"eleutherai","wandb_project":"pythia","log_interval":10,"log_grad_norm":true,"text_gen_type":"unconditional","deepspeed_slurm":true,"user_script":"/fsx/gpt-neox/train.py","save_iters":[0,1,2,4,8,16,32,64,128,256,512,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000,11000,12000,13000,14000,15000,16000,17000,18000,19000,20000,21000,22000,23000,24000,25000,26000,27000,28000,29000,30000,31000,32000,33000,34000,35000,36000,37000,38000,39000,40000,41000,42000,43000,44000,45000,46000,47000,48000,49000,50000,51000,52000,53000,54000,55000,56000,57000,58000,59000,60000,61000,62000,63000,64000,65000,66000,67000,68000,69000,70000,71000,72000,73000,74000,75000,76000,77000,78000,79000,80000,81000,82000,83000,84000,85000,86000,87000,88000,89000,90000,91000,92000,93000,94000,95000,96000,97000,98000,99000,100000,101000,102000,103000,104000,105000,106000,107000,108000,109000,110000,111000,112000,113000,114000,115000,116000,117000,118000,119000,120000,121000,122000,123000,124000,125000,126000,127000,128000,129000,130000,131000,132000,133000,134000,135000,136000,137000,138000,139000,140000,141000,142000],"global_num_gpus":256}
System Hardware
| CPU count | 96 |
| GPU count | 8 |
| GPU type | NVIDIA A100-SXM4-80GB |
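The table describes the hardware of a single node; assuming every node in the Slurm allocation looks the same, the "global_num_gpus": 256 value in the command above implies a 32-node job (a back-of-the-envelope check, not a value recorded by W&B):

```python
gpus_per_node = 8        # GPU count from the table above
cpus_per_node = 96       # CPU count from the table above
global_num_gpus = 256    # from the --megatron_config JSON in the Command section

nodes = global_num_gpus // gpus_per_node   # 32 nodes of 8x A100-SXM4-80GB
print(f"{nodes} nodes, {nodes * cpus_per_node} CPU cores across the allocation")
```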
W&B CLI Version
0.10.28
Config
Config parameters are your model's inputs.
193 keys (collapsed in the W&B interface; the individual values largely mirror the --megatron_config JSON shown in the Command section above).
Summary
Summary metrics are your model's outputs.
No summary metrics saved for this run.