Chilli's group workspace
2wCxUroswJk3JQ52GEkeB3_1jzsyf77
What makes this group special?
Tags
ip-26-0-148-159-0
Notes
Author
State
Crashed
Start time
August 3rd, 2023 8:00:07 AM
Runtime
1m 55s
Tracked hours
-
Run path
eleutherai/neox/284ucy6b
OS
Linux-5.15.0-1037-aws-x86_64-with-glibc2.17
Python version
3.8.17
Command
/fsx/lintangsutawika/01-project-pythia/gpt-neox/train.py --deepspeed_config {"train_batch_size":1024,"train_micro_batch_size_per_gpu":16,"optimizer":{"type":"Adam","params":{"lr":0.00016,"betas":[0.9,0.95],"eps":1.6e-05}},"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"gradient_clipping":1.0,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":500000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":500000000,"contiguous_gradients":true,"cpu_offload":false},"steps_per_print":1,"wall_clock_breakdown":true} --megatron_config {"launcher":"slurm","train_batch_size":1024,"train_micro_batch_size_per_gpu":16,"optimizer":{"type":"Adam","params":{"lr":0.00016,"betas":[0.9,0.95],"eps":1.6e-05}},"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"gradient_clipping":1.0,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":500000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":500000000,"contiguous_gradients":true,"cpu_offload":false},"steps_per_print":1,"wall_clock_breakdown":true,"precision":"fp16","num_layers":32,"hidden_size":2560,"num_attention_heads":32,"seq_length":2048,"max_position_embeddings":2048,"pos_emb":"rotary","no_weight_tying":true,"attention_config":["flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash"],"sparsity_config":{},"scaled_upper_triang_masked_softmax_fusion":true,"bias_gelu_fusion":true,"rotary_pct":0.25,"init_method":"small_init","output_layer_init_method":"wang_init","gpt_j_residual":true,"output_layer_parallelism":"column","lr_decay_style":"cosine","lr_decay_iters":143000,"min_lr":1.6e-05,
"optimizer_type":"Adam","zero_stage":1,"zero_reduce_scatter":true,"zero_contiguous_gradients":true,"zero_reduce_bucket_size":500000000,"zero_allgather_bucket_size":500000000,"lr":0.00016,"tokenizer_type":"HFTokenizer","train_data_paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"test_data_paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"valid_data_paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"train_data_weights":[1.0],"valid_data_weights":[1.0],"test_data_weights":[1.0],"data_impl":"mmap","save":"/fsx/lintangsutawika/checkpoints/temp_neox_models","config_files":{"2-8B.yml":{"vocab-file":"/fsx/lintangsutawika/01-project-pythia/20B_tokenizer.json","train-data-paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"valid-data-paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"test-data-paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"pipe_parallel_size":1,"model_parallel_size":1,"num_layers":32,"hidden_size":2560,"num_attention_heads":32,"seq_length":2048,"max_position_embeddings":2048,"pos_emb":"rotary","rotary_pct":0.25,"no_weight_tying":true,"gpt_j_residual":true,"output_layer_parallelism":"column","attention_config":[[["flash"],32]],"scaled_upper_triang_masked_softmax_fusion":true,"bias_gelu_fusion":true,"init_method":"small_init","output_layer_init_method":"wang_init","optimizer":{"type":"Adam","params":{"lr":0.00016,"betas":[0.9,0.95],"eps":1.6e-05}},"min_lr":1.6e-05,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":500000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":500000000,"contiguous_gradients":true,"cpu_offload":false},"train_micro_batch_size_per_gpu":16,"gas":1,"data_impl":"mmap","num_workers":1,"checkpoint_activations":true,"checkpoint_num_layers":1,"partition_activations":true,"synchronize_each_layer":true,"gradient_clipping":1.0,"weight_decay":0.1,"hidden_dropout":0,"attention_dropout":0,"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,
"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"train-iters":143000,"lr-decay-iters":143000,"distributed-backend":"nccl","lr-decay-style":"cosine","warmup":0.01,"checkpoint-factor":10000,"log_interval":1,"steps_per_print":1,"wall_clock_breakdown":true,"tokenizer_type":"HFTokenizer","launcher":"slurm","deepspeed_slurm":true},"save-per-step_envloaded.yml":{"save":"/fsx/lintangsutawika/checkpoints/temp_neox_models","load":"/fsx/lintangsutawika/checkpoints/temp_neox_models","extra-save-iters":[38004,38005,38006],"exit_interval":38006}},"load":"/fsx/lintangsutawika/checkpoints/temp_neox_models","checkpoint_factor":10000,"extra_save_iters":[38004,38005,38006],"batch_size":16,"train_iters":143000,"vocab_file":"/fsx/lintangsutawika/01-project-pythia/20B_tokenizer.json","num_workers":1,"exit_interval":38006,"attention_dropout":0,"hidden_dropout":0,"weight_decay":0.1,"checkpoint_activations":true,"synchronize_each_layer":true,"partition_activations":true,"gas":1,"clip_grad":1.0,"dynamic_loss_scale":true,"pipe_parallel_size":1,"is_pipe_parallel":true,"wandb_group":"2wCxUroswJk3JQ52GEkeB3_1jzsyf77","log_interval":1,"text_gen_type":"unconditional","deepspeed_slurm":true,"user_script":"/fsx/lintangsutawika/01-project-pythia/gpt-neox/train.py","save_iters":[10000,20000,30000,38004,38005,38006,40000,50000,60000,70000,80000,90000,100000,110000,120000,130000,140000],"global_num_gpus":64}
System Hardware
CPU count | 96 |
GPU count | 8 |
GPU type | NVIDIA A100-SXM4-40GB |
W&B CLI Version
0.10.28
Config
Config parameters are your model's inputs. Learn more
- {} 193 keys▶
- "gelu"
- false
- 1,000
- null
- false
- [] 32 items▶
- 0
- false
- 16
- false
- true
- false
- true
- 10,000
- false
- 1
- "linear"
- false
- 1
- null
- {} 2 keys▶
- {} 49 keys▶
- {} 4 keys▶
- 38,006
- [] 3 items▶
- 38,004
- 38,005
- 38,006
- "/fsx/lintangsutawika/checkpoints/temp_neox_models"
- "/fsx/lintangsutawika/checkpoints/temp_neox_models"
- false
- "mmap"
- null
- false
- null
- true
- true
- false
- true
- false
- "nccl"
- null
- null
- null
- false
- true
- false
- 1,000
- 100
- ""
- null
- null
- 38,006
- [] 3 items▶
- 38,004
- 38,005
- 38,006
- false
- {} 8 keys▶
- 500,000,000
- true
- 1
46 ... 95▶▶96 ... 145▶▶146 ... 188▶▶
Summary
Summary metrics are your model's outputs. Learn more
- {} 27 keys▶
- 153,581,600,317,638.1
- 5.700433969497681
- 179.63544626238945
- 0.29397010803222656
- 0.011205673217773438
- 281.66818618774414
- 0.05793571472167969
- 3,725.6929874420166
- 0.009059906005859377
- 0.027179718017578125
- 3,725.6131172180176
- 3,725.6367206573486
- 3,725.715160369873
- 16.219139099121094
- 504.2617321014404
- 1,297.5029945373535
- 65.49632803353117
- 8.864737897565846
- 22.80963086361395
- 4.959407563088256
- 318.16768646240234
- 0.02694129943847656
- 282.11092948913574
- 0
- 0.00000134265734265734
- 9.554676055908203
- 4,096
Artifact Outputs
This run produced these artifacts as outputs. Learn more
Type
Name
Consumer count
Loading...