v2-12B-deduped_2rfw9acn

Tags: ip-26-0-137-222-0

State: Crashed
Start time: February 24th, 2023 2:39:17 PM
Runtime: 31s
Tracked hours: -
Run path: eleutherai/pythia/i6mo0xtp
OS: Linux-5.15.0-1023-aws-x86_64-with-glibc2.31
Python version: 3.9.15
Git repository: git clone https://github.com/EleutherAI/gpt-neox
Git state: git checkout -b "ip-26-0-137-222-0" d36f623fa2634f7824eaa83671f81f4c5773e120
Command:
/fsx/hailey/gpt-neox/train.py --deepspeed_config {"train_batch_size":1024,"train_micro_batch_size_per_gpu":1,"gradient_accumulation_steps":16,"optimizer":{"type":"Adam","params":{"lr":0.00012,"betas":[0.9,0.95],"eps":1e-08}},"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"gradient_clipping":1.0,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":1260000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":1260000000,"contiguous_gradients":true,"cpu_offload":false},"wall_clock_breakdown":true} --megatron_config {"launcher":"slurm","train_batch_size":1024,"train_micro_batch_size_per_gpu":1,"gradient_accumulation_steps":16,"optimizer":{"type":"Adam","params":{"lr":0.00012,"betas":[0.9,0.95],"eps":1e-08}},"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"gradient_clipping":1.0,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":1260000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":1260000000,"contiguous_gradients":true,"cpu_offload":false},"wall_clock_breakdown":true,"precision":"fp16","num_layers":36,"hidden_size":5120,"num_attention_heads":40,"seq_length":2048,"max_position_embeddings":2048,"pos_emb":"rotary","no_weight_tying":true,"attention_config":["flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash","flash"],"sparsity_config":{},"scaled_upper_triang_masked_softmax_fusion":true,"bias_gelu_fusion":true,"rotary_pct":0.25,"gpt_j_residual":true,"output_layer_parallelism":"column","lr_decay_style":"cosine","lr_decay_iters":143000,"min_lr":1.2e-05,"optimizer_type":"Adam","zero_stage":1,"zero_reduce_scatter":true,"zero_contiguous_gradients":true,"zero_reduce_bucket_size":1260000000,"zero_allgather_bucket_size":1260000000,"lr":0.00012,"tokenizer_type":"HFTokenizer","train_data_paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"test_data_paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"valid_data_paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"train_data_weights":[1.0],"valid_data_weights":[1.0],"test_data_weights":[1.0],"data_impl":"mmap","save":"/fsx/hailey/pythia/new_ckpts/pythia-v2-12b-deduped","config_files":{"12B-deduped.yml":{"pipe-parallel-size":1,"model-parallel-size":4,"num-layers":36,"hidden-size":5120,"num-attention-heads":40,"seq-length":2048,"max-position-embeddings":2048,"norm":"layernorm","pos-emb":"rotary","rotary_pct":0.25,"no-weight-tying":true,"gpt_j_residual":true,"output_layer_parallelism":"column","attention-config":[[["flash"],36]],"scaled-upper-triang-masked-softmax-fusion":true,"bias-gelu-fusion":true,"optimizer":{"type":"Adam","params":{"lr":0.00012,"betas":[0.9,0.95],"eps":1e-08}},"min_lr":1.2e-05,"zero_optimization":{"stage":1,"allgather_partitions":true,"allgather_bucket_size":1260000000,"overlap_comm":true,"reduce_scatter":true,"reduce_bucket_size":1260000000,"contiguous_gradients":true,"cpu_offload":false},"train_micro_batch_size_per_gpu":1,"gradient_accumulation_steps":16,"data-impl":"mmap","checkpoint-activations":true,"checkpoint-num-layers":1,"partition-activations":true,"synchronize-each-layer":true,"gradient_clipping":1.0,"weight-decay":
0.1,"hidden-dropout":0,"attention-dropout":0,"fp16":{"fp16":true,"enabled":true,"loss_scale":0,"loss_scale_window":1000,"initial_scale_power":12,"hysteresis":2,"min_loss_scale":1},"train-iters":143000,"lr-decay-iters":143000,"distributed-backend":"nccl","lr-decay-style":"cosine","warmup":0.01,"checkpoint-factor":1000,"extra-save-iters":[0,1,2,4,8,16,32,64,128,256,512],"eval-interval":143000,"eval-iters":10,"save":"/fsx/hailey/pythia/new_ckpts/pythia-v2-12b-deduped","load":"/fsx/hailey/pythia/new_ckpts/pythia-v2-12b-deduped","log-interval":10,"steps_per_print":10,"wall_clock_breakdown":true,"train-data-paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"valid-data-paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"test-data-paths":["/fsx/pile_deduped/pile_0.87_deduped_text_document"],"tokenizer_type":"HFTokenizer","vocab-file":"/fsx/pile/20B_tokenizer.json","use_wandb":true,"wandb_team":"eleutherai","wandb_project":"pythia","wandb_group":"v2-12B-deduped","launcher":"slurm","deepspeed_slurm":true}},"load":"/fsx/hailey/pythia/new_ckpts/pythia-v2-12b-deduped","checkpoint_factor":1000,"extra_save_iters":[0,1,2,4,8,16,32,64,128,256,512],"batch_size":1,"train_iters":143000,"eval_iters":10,"eval_interval":143000,"vocab_file":"/fsx/pile/20B_tokenizer.json","attention_dropout":0,"hidden_dropout":0,"weight_decay":0.1,"checkpoint_activations":true,"synchronize_each_layer":true,"partition_activations":true,"gas":16,"clip_grad":1.0,"dynamic_loss_scale":true,"pipe_parallel_size":1,"model_parallel_size":4,"is_pipe_parallel":true,"use_wandb":true,"wandb_group":"v2-12B-deduped_2rfw9acn","wandb_team":"eleutherai","wandb_project":"pythia","log_interval":10,"text_gen_type":"unconditional","deepspeed_slurm":true,"user_script":"/fsx/hailey/gpt-neox/train.py","save_iters":[0,1,2,4,8,16,32,64,128,256,512,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000,11000,12000,13000,14000,15000,16000,17000,18000,19000,20000,21000,22000,23000,24000,25000,26000,27000,28000,29000,30000,31000,32000,33000,34000,35000,36000,37000,38000,39000,40000,41000,42000,43000,44000,45000,46000,47000,48000,49000,50000,51000,52000,53000,54000,55000,56000,57000,58000,59000,60000,61000,62000,63000,64000,65000,66000,67000,68000,69000,70000,71000,72000,73000,74000,75000,76000,77000,78000,79000,80000,81000,82000,83000,84000,85000,86000,87000,88000,89000,90000,91000,92000,93000,94000,95000,96000,97000,98000,99000,100000,101000,102000,103000,104000,105000,106000,107000,108000,109000,110000,111000,112000,113000,114000,115000,116000,117000,118000,119000,120000,121000,122000,123000,124000,125000,126000,127000,128000,129000,130000,131000,132000,133000,134000,135000,136000,137000,138000,139000,140000,141000,142000],"global_num_gpus":256}
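
The global batch size in this command is implied by the per-GPU micro-batch size, the gradient accumulation steps, and the data-parallel world size. Below is a minimal Python sketch of that arithmetic, using only values copied from the megatron_config above; the variable names are illustrative, not gpt-neox internals.

    # Batch-size arithmetic implied by the megatron_config in the Command above.
    global_num_gpus = 256
    model_parallel_size = 4          # tensor parallelism
    pipe_parallel_size = 1           # pipeline parallelism
    train_micro_batch_size_per_gpu = 1
    gradient_accumulation_steps = 16

    # Data-parallel replicas = total GPUs / (tensor-parallel * pipeline-parallel).
    data_parallel_size = global_num_gpus // (model_parallel_size * pipe_parallel_size)  # 64

    # Global batch = micro-batch * grad-accum steps * data-parallel replicas.
    train_batch_size = (train_micro_batch_size_per_gpu
                        * gradient_accumulation_steps
                        * data_parallel_size)
    assert train_batch_size == 1024  # matches train_batch_size in both configs

    # With 8 GPUs per node (see System Hardware below), 256 GPUs spans 32 nodes.
    num_nodes = global_num_gpus // 8  # 32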
System Hardware
CPU count: 96
GPU count: 8
GPU type: NVIDIA A100-SXM4-40GB
W&B CLI Version: 0.10.28
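
The run itself can be retrieved programmatically from the Run path above. The following is a minimal sketch using the public W&B API, assuming read access to the eleutherai/pythia project and an installed wandb client.

    import wandb

    # Look up this run by its run path: entity/project/run_id.
    api = wandb.Api()
    run = api.run("eleutherai/pythia/i6mo0xtp")

    print(run.state)        # expected "crashed" for this run
    print(run.group)        # "v2-12B-deduped_2rfw9acn"
    print(len(run.config))  # 192 keys, mirroring the megatron_config above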
Config


[Config viewer omitted: 192 keys, rendered without key names in the page export; the values mirror the deepspeed_config and megatron_config given verbatim in the Command above.]
Summary

No summary metrics saved for this run.

Artifact Outputs

This run produced 1 artifact as output.