Chilli's group workspace
4N99WmSaVJpjAKR3NzjBWx_36xlgurp
What makes this group special?
Tags
ip-26-0-153-77-0
Notes
Author
State
Finished
Start time
August 3rd, 2023 7:45:44 AM
Runtime
3m 57s
Tracked hours
-
Run path
eleutherai/neox/orhsl4nv
OS
Linux-5.15.0-1037-aws-x86_64-with-glibc2.17
Python version
3.8.17
Command
/fsx/lintangsutawika/01-project-pythia/gpt-neox/train.py --deepspeed_config {"train_batch_size":1024,"train_micro_batch_size_per_gpu":16,"optimizer":{"type":"Adam","params":{"lr":0.00016,"betas":[0.9,0.95],"eps":1.6e-05}},"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"gradient_clipping":1.0,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":500000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":500000000,"contiguous_gradients":true,"cpu_offload":false},"steps_per_print":1,"wall_clock_breakdown":true} --megatron_config {"launcher":"slurm","train_batch_size":1024,"train_micro_batch_size_per_gpu":16,"optimizer":{"type":"Adam","params":{"lr":0.00016,"betas":[0.9,0.95],"eps":1.6e-05}},"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"gradient_clipping":1.0,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":500000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":500000000,"contiguous_gradients":true,"cpu_offload":false},"steps_per_print":1,"wall_clock_breakdown":true,"precision":"fp16","num_layers":32,"hidden_size":2560,"num_attention_heads":32,"seq_length":2048,"max_position_embeddings":2048,"pos_emb":"rotary","no_weight_tying":true,"attention_config":["flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash"],"sparsity_config":{},"scaled_upper_triang_masked_softmax_fusion":true,"bias_gelu_fusion":true,"rotary_pct":0.25,"init_method":"small_init","output_layer_init_method":"wang_init","gpt_j_residual":true,"output_layer_parallelism":"column","lr_decay_style":"cosine","lr_decay_iters":143000,"min_lr":1.6e-05,"optimizer_type":"Adam","zero_stage":1,"zero_reduce_scatter":true,"zero_contiguous_gradients":true,"zero_reduce_bucket_size":500000000,"zero_allgather_bucket_size":500000000,"lr":0.00016,"tokenizer_type":"HFTokenizer","train_data_paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"test_data_paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"valid_data_paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"train_data_weights":[1.0],"valid_data_weights":[1.0],"test_data_weights":[1.0],"data_impl":"mmap","save":"/fsx/lintangsutawika/checkpoints/temp_neox_models","config_files":{"2-8B.yml":{"vocab-file":"/fsx/lintangsutawika/01-project-pythia/20B_tokenizer.json","train-data-paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"valid-data-paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"test-data-paths":["/fsx/pile/pile_20B_tokenizer_text_document"],"pipe_parallel_size":1,"model_parallel_size":1,"num_layers":32,"hidden_size":2560,"num_attention_heads":32,"seq_length":2048,"max_position_embeddings":2048,"pos_emb":"rotary","rotary_pct":0.25,"no_weight_tying":true,"gpt_j_residual":true,"output_layer_parallelism":"column","attention_config":[[["flash"],32]],"scaled_upper_triang_masked_softmax_fusion":true,"bias_gelu_fusion":true,"init_method":"small_init","output_layer_init_method":"wang_init","optimizer":{"type":"Adam","params":{"lr":0.00016,"betas":[0.9,0.95],"eps":1.6e-05}},"min_lr":1.6e-05,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":500000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":500000000,"contiguous_gradients":true,"cpu_offload":false},"train_micro_batch_size_per_gpu":16,"gas":1,"data_impl":"mmap","num_workers":1,"checkpoint_activations":true,"checkpoint_num_layers":1,"partition_activations":true,"synchronize_each_layer":true,"gradient_clipping":1.0,"weight_decay":0.1,"hidden_dropout":0,"attention_dropout":0,"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"train-iters":143000,"lr-decay-iters":143000,"distributed-backend":"nccl","lr-decay-style":"cosine","warmup":0.01,"checkpoint-factor":10000,"log_interval":1,"steps_per_print":1,"wall_clock_breakdown":true,"tokenizer_type":"HFTokenizer","launcher":"slurm","deepspeed_slurm":true},"save-per-step_envloaded.yml":{"save":"/fsx/lintangsutawika/checkpoints/temp_neox_models","load":"/fsx/lintangsutawika/checkpoints/temp_neox_models","extra-save-iters":[38000,38001,38002],"exit_interval":38002}},"load":"/fsx/lintangsutawika/checkpoints/temp_neox_models","checkpoint_factor":10000,"extra_save_iters":[38000,38001,38002],"batch_size":16,"train_iters":143000,"vocab_file":"/fsx/lintangsutawika/01-project-pythia/20B_tokenizer.json","num_workers":1,"exit_interval":38002,"attention_dropout":0,"hidden_dropout":0,"weight_decay":0.1,"checkpoint_activations":true,"synchronize_each_layer":true,"partition_activations":true,"gas":1,"clip_grad":1.0,"dynamic_loss_scale":true,"pipe_parallel_size":1,"is_pipe_parallel":true,"wandb_group":"4N99WmSaVJpjAKR3NzjBWx_36xlgurp","log_interval":1,"text_gen_type":"unconditional","deepspeed_slurm":true,"user_script":"/fsx/lintangsutawika/01-project-pythia/gpt-neox/train.py","save_iters":[10000,20000,30000,38000,38001,38002,40000,50000,60000,70000,80000,90000,100000,110000,120000,130000,140000],"global_num_gpus":64}
System Hardware
CPU count | 96 |
GPU count | 8 |
GPU type | NVIDIA A100-SXM4-40GB |
W&B CLI Version
0.10.28
Config
Config parameters are your model's inputs. Learn more
- {} 193 keys▶
- "gelu"
- false
- 1,000
- null
- false
- [] 32 items▶
- 0
- false
- 16
- false
- true
- false
- true
- 10,000
- false
- 1
- "linear"
- false
- 1
- null
- {} 2 keys▶
- {} 49 keys▶
- {} 4 keys▶
- 38,002
- [] 3 items▶
- 38,000
- 38,001
- 38,002
- "/fsx/lintangsutawika/checkpoints/temp_neox_models"
- "/fsx/lintangsutawika/checkpoints/temp_neox_models"
- false
- "mmap"
- null
- false
- null
- true
- true
- false
- true
- false
- "nccl"
- null
- null
- null
- false
- true
- false
- 1,000
- 100
- ""
- null
- null
- 38,002
- [] 3 items▶
- 38,000
- 38,001
- 38,002
- false
- {} 8 keys▶
- 500,000,000
- true
- 1
46 ... 95▶▶96 ... 145▶▶146 ... 188▶▶
Summary
Summary metrics are your model's outputs. Learn more
No summary metrics saved for this run.
Check the summary metrics documentation for more information.