Skip to main content

Eleutherai-oslo's group workspace

aavbXsg9eeKo2s6RMF59Eh_p1q023jz

What makes this group special?
Tags

ip-26-0-128-48-0

Notes
State
Crashed
Start time
April 19th, 2023 4:23:40 PM
Runtime
1h 6m 9s
Tracked hours
-
Run path
eleutherai-oslo/polyglot-ko-12_8b/3pglgcwu
OS
Linux-5.15.0-1019-aws-x86_64-with-glibc2.31
Python version
3.9.12
Git repository
git clone https://github.com/EleutherAI/gpt-neox.git
Git state
git checkout -b "ip-26-0-128-48-0" bf9b3012943259407c4274604504193228b3c3f0
Command
/fsx/polyglot.train/gpt-neox/train.py --deepspeed_config "{\"train_batch_size\": 64, \"train_micro_batch_size_per_gpu\": 8, \"gradient_accumulation_steps\": 2, \"optimizer\": {\"type\": \"Adam\", \"params\": {\"lr\": 0.0001, \"betas\": [0.9, 0.95], \"eps\": 1e-08}}, \"fp16\": {\"fp16\": true, \"enabled\": true, \"initial_scale_power\": 32, \"loss_scale_window\": 1000, \"hysteresis\": 2, \"min_loss_scale\": 1}, \"gradient_clipping\": 1.0, \"zero_optimization\": {\"stage\": 1, \"allgather_partitions\": true, \"allgather_bucket_size\": 500000000, \"overlap_comm\": true, \"reduce_scatter\": true, \"reduce_bucket_size\": 500000000, \"contiguous_gradients\": true, \"cpu_offload\": false}, \"wall_clock_breakdown\": true}" --megatron_config "{\"hostfile\": \"/fsx/polyglot.train/hostfiles/hosts_162020\", \"launcher\": \"openmpi\", \"train_batch_size\": 64, \"train_micro_batch_size_per_gpu\": 8, \"gradient_accumulation_steps\": 2, \"optimizer\": {\"type\": \"Adam\", \"params\": {\"lr\": 0.0001, \"betas\": [0.9, 0.95], \"eps\": 1e-08}}, \"fp16\": {\"fp16\": true, \"enabled\": true, \"initial_scale_power\": 32, \"loss_scale_window\": 1000, \"hysteresis\": 2, \"min_loss_scale\": 1}, \"gradient_clipping\": 1.0, \"zero_optimization\": {\"stage\": 1, \"allgather_partitions\": true, \"allgather_bucket_size\": 500000000, \"overlap_comm\": true, \"reduce_scatter\": true, \"reduce_bucket_size\": 500000000, \"contiguous_gradients\": true, \"cpu_offload\": false}, \"wall_clock_breakdown\": true, \"precision\": \"fp16\", \"num_layers\": 36, \"hidden_size\": 5120, \"num_attention_heads\": 40, \"seq_length\": 2048, \"max_position_embeddings\": 2048, \"pos_emb\": \"rotary\", \"no_weight_tying\": true, \"attention_config\": [\"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", 
\"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\"], \"sparsity_config\": {}, \"scaled_upper_triang_masked_softmax_fusion\": true, \"bias_gelu_fusion\": true, \"rotary_ndims\": 64, \"init_method\": \"small_init\", \"output_layer_init_method\": \"wang_init\", \"gpt_j_residual\": true, \"output_layer_parallelism\": \"column\", \"lr_decay_style\": \"cosine\", \"lr_decay_iters\": 500000, \"min_lr\": 1e-05, \"optimizer_type\": \"Adam\", \"zero_stage\": 1, \"zero_reduce_scatter\": true, \"zero_contiguous_gradients\": true, \"zero_reduce_bucket_size\": 500000000, \"zero_allgather_bucket_size\": 500000000, \"lr\": 0.0001, \"tokenizer_type\": \"HFTokenizer\", \"data_path\": \"/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document\", \"data_impl\": \"mmap\", \"save\": \"/fsx/polyglot.train/gpt-neox/checkpoints/13B_scratch\", \"config_files\": {\"13B_ko.yml\": \"# GPT-2 pretraining setup\n{\n # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in\n \\"tokenizer-type\\": \\"HFTokenizer\\",\n \\"vocab-file\\": \\"./tokenizer/MBBPE/tokenizer.json\\",\n \\"save\\": \\"/fsx/polyglot.train/gpt-neox/checkpoints/13B_scratch\\",\n # \\"load\\": \\"/fsx/polyglot.train/gpt-neox/checkpoints/6B_scratch\\",\n\n # wandb config\n \\"wandb_team\\": \\"eleutherai-oslo\\",\n\n # If finetuning, edit the following to the location of your finetuning dataset:\n \\"data-path\\": \\"/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document\\",\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n # across the node boundaries )\n \\"pipe-parallel-size\\": 1,\n \\"model-parallel-size\\": 4,\n\n # model settings\n \\"num-layers\\": 36,\n \\"hidden-size\\": 5120,\n \\"num-attention-heads\\": 40,\n \\"seq-length\\": 2048,\n \\"max-position-embeddings\\": 
2048,\n \\"norm\\": \\"layernorm\\",\n \\"pos-emb\\": \\"rotary\\",\n \\"no-weight-tying\\": true,\n \\"rotary_ndims\\": 64,\n \\"gpt_j_residual\\": true,\n \\"output_layer_parallelism\\": \\"column\\",\n\n # these should provide some speedup but takes a while to build, set to true if desired\n \\"scaled-upper-triang-masked-softmax-fusion\\": true,\n \\"bias-gelu-fusion\\": true,\n\n # init methods\n \\"init_method\\": \\"small_init\\",\n \\"output_layer_init_method\\": \\"wang_init\\",\n\n # optimizer settings\n \\"optimizer\\": {\n \\"type\\": \\"Adam\\",\n \\"params\\": {\n \\"lr\\": 0.0001,\n \\"betas\\": [0.9, 0.95],\n \\"eps\\": 1.0e-8,\n }\n },\n \\"min_lr\\": 0.00001,\n \\"zero_optimization\\": {\n \\"stage\\": 1,\n \\"allgather_partitions\\": True,\n \\"allgather_bucket_size\\": 500000000,\n \\"overlap_comm\\": True,\n \\"reduce_scatter\\": True,\n \\"reduce_bucket_size\\": 500000000,\n \\"contiguous_gradients\\": True,\n \\"cpu_offload\\": False\n },\n\n # batch / data settings\n \\"train_micro_batch_size_per_gpu\\": 8,\n \\"gradient_accumulation_steps\\": 2,\n \\"data-impl\\": \\"mmap\\",\n # \\"split\\": \\"949,50,1\\",\n\n # activation checkpointing\n \\"checkpoint-activations\\": true,\n \\"checkpoint-num-layers\\": 1,\n \\"partition-activations\\": true,\n \\"synchronize-each-layer\\": true,\n\n # regularization\n \\"gradient_clipping\\": 1.0,\n \\"weight-decay\\": 0.1,\n \\"hidden-dropout\\": 0,\n \\"attention-dropout\\": 0,\n\n # precision settings\n # \\"attention_softmax_in_fp32\\": true,\n \\"fp16\\": {\n \\"fp16\\": true,\n \\"enabled\\": true,\n \\"initial_scale_power\\": 32,\n \\"loss_scale_window\\": 1000,\n \\"hysteresis\\": 2,\n \\"min_loss_scale\\": 1\n },\n\n # misc. 
training settings\n \\"train-iters\\": 500000,\n \\"lr-decay-iters\\": 500000,\n \\"distributed-backend\\": \\"nccl\\",\n \\"lr-decay-style\\": \\"cosine\\",\n \\"warmup\\": 0.01,\n \\"save-interval\\": 1000,\n \\"eval-interval\\": 1000,\n \\"eval-tasks-interval\\": 50000000000,\n \\"eval-iters\\": 10,\n\n # logging\n \\"log-interval\\": 100,\n \\"steps_per_print\\": 10,\n \\"keep-last-n-checkpoints\\": 5,\n \\"wall_clock_breakdown\\": true,\n\n # wandb\n \\"use_wandb\\": true,\n # \\"wandb_init_all_ranks\\": true,\n \\"wandb_project\\": \\"polyglot-ko-12_8b\\",\n \\"eval_tasks\\": [\\"nsmc\\"],\n\n # deepspeed launcher\n \\"launcher\\": \\"openmpi\\",\n \\"deepspeed_mpi\\": true\n}\n\"}, \"save_interval\": 1000, \"batch_size\": 8, \"train_iters\": 500000, \"eval_iters\": 10, \"keep_last_n_checkpoints\": 5, \"eval_tasks_interval\": 50000000000, \"vocab_file\": \"./tokenizer/MBBPE/tokenizer.json\", \"attention_dropout\": 0, \"hidden_dropout\": 0, \"weight_decay\": 0.1, \"checkpoint_activations\": true, \"synchronize_each_layer\": true, \"partition_activations\": true, \"gas\": 2, \"clip_grad\": 1.0, \"dynamic_loss_scale\": true, \"pipe_parallel_size\": 1, \"model_parallel_size\": 4, \"is_pipe_parallel\": true, \"use_wandb\": true, \"wandb_group\": \"aavbXsg9eeKo2s6RMF59Eh_p1q023jz\", \"wandb_team\": \"eleutherai-oslo\", \"wandb_project\": \"polyglot-ko-12_8b\", \"log_interval\": 100, \"text_gen_type\": \"unconditional\", \"eval_tasks\": [\"nsmc\"], \"deepspeed_mpi\": true, \"user_script\": \"/fsx/polyglot.train/gpt-neox/train.py\", \"global_num_gpus\": 16}"
System Hardware
CPU count
96
GPU count
8
GPU type
NVIDIA A100-SXM4-40GB
W&B CLI Version
0.12.21
Config

Config parameters are your model's inputs. Learn more

  • {} 185 keys
    • "gelu"
    • false
    • 1,000
    • null
    • false
    • [] 36 items
      • 0
      • false
      • 8
      • false
      • true
      • false
      • true
      • false
      • 1
      • false
      • 1
      • {} 1 key
        • "# GPT-2 pretraining setup { # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in "tokenizer-type": "HFTokenizer", "vocab-file": "./tokenizer/MBBPE/tokenizer.json", "save": "/fsx/polyglot.train/gpt-neox/checkpoints/13B_scratch", # "load": "/fsx/polyglot.train/gpt-neox/checkpoints/6B_scratch", # wandb config "wandb_team": "eleutherai-oslo", # If finetuning, edit the following to the location of your finetuning dataset: "data-path": "/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document", # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 1, "model-parallel-size": 4, # model settings "num-layers": 36, "hidden-size": 5120, "num-attention-heads": 40, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "no-weight-tying": true, "rotary_ndims": 64, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": true, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 0.0001, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 0.00001, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 500000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 500000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 2, "data-impl": "mmap", # "split": "949,50,1", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": true, "synchronize-each-layer": true, # regularization 
"gradient_clipping": 1.0, "weight-decay": 0.1, "hidden-dropout": 0, "attention-dropout": 0, # precision settings # "attention_softmax_in_fp32": true, "fp16": { "fp16": true, "enabled": true, "initial_scale_power": 32, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 500000, "lr-decay-iters": 500000, "distributed-backend": "nccl", "lr-decay-style": "cosine", "warmup": 0.01, "save-interval": 1000, "eval-interval": 1000, "eval-tasks-interval": 50000000000, "eval-iters": 10, # logging "log-interval": 100, "steps_per_print": 10, "keep-last-n-checkpoints": 5, "wall_clock_breakdown": true, # wandb "use_wandb": true, # "wandb_init_all_ranks": true, "wandb_project": "polyglot-ko-12_8b", "eval_tasks": ["nsmc"], # deepspeed launcher "launcher": "openmpi", "deepspeed_mpi": true } "
      • false
      • "mmap"
      • "/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document"
      • false
      • null
      • true
      • true
      • true
      • false
      • "nccl"
      • null
      • null
      • null
      • false
      • true
      • false
      • 1,000
      • 10
      • ""
      • [] 1 item
        • "nsmc"
      • 50,000,000,000
      • null
      • null
      • false
      • null
      • {} 6 keys
        • false
        • false
        • 46 ... 95
          96 ... 145
          146 ... 180
        • {} 8 keys
          • 500,000,000
          • true
          • 1
        Summary

        Summary metrics are your model's outputs. Learn more

        No summary metrics saved for this run.

        Check the summary metrics documentation for more information.

        Artifact Outputs

        This run produced these artifacts as outputs. Total: 1. Learn more

        Loading...