Eleutherai-oslo's group workspace
bbmEhMAwoKq4TkRTSAByxf_srvwlu70
What makes this group special?
Tags
ip-26-0-132-37-0
Notes
Author
State
Crashed
Start time
April 20th, 2023 2:35:31 AM
Runtime
14m 19s
Tracked hours
-
Run path
eleutherai-oslo/polyglot-ko-12_8b/3obrt2cj
OS
Linux-5.15.0-1019-aws-x86_64-with-glibc2.31
Python version
3.9.16
Git repository
git clone https://github.com/EleutherAI/gpt-neox.git
Git state
git checkout -b "ip-26-0-132-37-0" bf9b3012943259407c4274604504193228b3c3f0
Command
/fsx/polyglot.train/gpt-neox/train.py --deepspeed_config "{\"train_batch_size\": 64, \"train_micro_batch_size_per_gpu\": 8, \"gradient_accumulation_steps\": 2, \"optimizer\": {\"type\": \"Adam\", \"params\": {\"lr\": 0.0001, \"betas\": [0.9, 0.95], \"eps\": 1e-08}}, \"fp16\": {\"fp16\": true, \"enabled\": true, \"initial_scale_power\": 32, \"loss_scale_window\": 1000, \"hysteresis\": 2, \"min_loss_scale\": 1}, \"gradient_clipping\": 1.0, \"zero_optimization\": {\"stage\": 1, \"allgather_partitions\": true, \"allgather_bucket_size\": 500000000, \"overlap_comm\": true, \"reduce_scatter\": true, \"reduce_bucket_size\": 500000000, \"contiguous_gradients\": true, \"cpu_offload\": false}, \"wall_clock_breakdown\": true}" --megatron_config "{\"hostfile\": \"/fsx/polyglot.train/hostfiles/hosts_163643\", \"launcher\": \"openmpi\", \"train_batch_size\": 64, \"train_micro_batch_size_per_gpu\": 8, \"gradient_accumulation_steps\": 2, \"optimizer\": {\"type\": \"Adam\", \"params\": {\"lr\": 0.0001, \"betas\": [0.9, 0.95], \"eps\": 1e-08}}, \"fp16\": {\"fp16\": true, \"enabled\": true, \"initial_scale_power\": 32, \"loss_scale_window\": 1000, \"hysteresis\": 2, \"min_loss_scale\": 1}, \"gradient_clipping\": 1.0, \"zero_optimization\": {\"stage\": 1, \"allgather_partitions\": true, \"allgather_bucket_size\": 500000000, \"overlap_comm\": true, \"reduce_scatter\": true, \"reduce_bucket_size\": 500000000, \"contiguous_gradients\": true, \"cpu_offload\": false}, \"wall_clock_breakdown\": true, \"precision\": \"fp16\", \"num_layers\": 36, \"hidden_size\": 5120, \"num_attention_heads\": 40, \"seq_length\": 2048, \"max_position_embeddings\": 2048, \"pos_emb\": \"rotary\", \"no_weight_tying\": true, \"attention_config\": [\"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\"], \"sparsity_config\": {}, \"scaled_upper_triang_masked_softmax_fusion\": true, \"bias_gelu_fusion\": true, \"rotary_ndims\": 64, \"init_method\": \"small_init\", \"output_layer_init_method\": \"wang_init\", \"gpt_j_residual\": true, \"output_layer_parallelism\": \"column\", \"lr_decay_style\": \"cosine\", \"lr_decay_iters\": 500000, \"min_lr\": 1e-05, \"optimizer_type\": \"Adam\", \"zero_stage\": 1, \"zero_reduce_scatter\": true, \"zero_contiguous_gradients\": true, \"zero_reduce_bucket_size\": 500000000, \"zero_allgather_bucket_size\": 500000000, \"lr\": 0.0001, \"tokenizer_type\": \"HFTokenizer\", \"data_path\": \"/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document\", \"data_impl\": \"mmap\", \"save\": \"/fsx/polyglot.train/gpt-neox/checkpoints/13B_scratch\", \"config_files\": {\"13B_ko.yml\": \"# GPT-2 pretraining setup\n{\n  # Tokenizer /  checkpoint settings - you will need to change these to the location you have them saved in\n  \\"tokenizer-type\\": \\"HFTokenizer\\",\n  \\"vocab-file\\": \\"./tokenizer/MBBPE/tokenizer.json\\",\n  \\"save\\": \\"/fsx/polyglot.train/gpt-neox/checkpoints/13B_scratch\\",\n  # \\"load\\": \\"/fsx/polyglot.train/gpt-neox/checkpoints/6B_scratch\\",\n\n  # wandb config\n  \\"wandb_team\\": \\"eleutherai-oslo\\",\n\n  # If finetuning, edit the following to the location of your finetuning dataset:\n  \\"data-path\\": \\"/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document\\",\n   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n   # across the node boundaries )\n   \\"pipe-parallel-size\\": 1,\n   \\"model-parallel-size\\": 4,\n\n   # model settings\n   \\"num-layers\\": 36,\n   \\"hidden-size\\": 5120,\n   \\"num-attention-heads\\": 40,\n   \\"seq-length\\": 2048,\n   \\"max-position-embeddings\\": 2048,\n   \\"norm\\": \\"layernorm\\",\n   \\"pos-emb\\": \\"rotary\\",\n   \\"no-weight-tying\\": true,\n   \\"rotary_ndims\\": 64,\n   \\"gpt_j_residual\\": true,\n   \\"output_layer_parallelism\\": \\"column\\",\n\n   # these should provide some speedup but takes a while to build, set to true if desired\n   \\"scaled-upper-triang-masked-softmax-fusion\\": true,\n   \\"bias-gelu-fusion\\": true,\n\n   # init methods\n   \\"init_method\\": \\"small_init\\",\n   \\"output_layer_init_method\\": \\"wang_init\\",\n\n   # optimizer settings\n   \\"optimizer\\": {\n     \\"type\\": \\"Adam\\",\n     \\"params\\": {\n       \\"lr\\": 0.0001,\n       \\"betas\\": [0.9, 0.95],\n       \\"eps\\": 1.0e-8,\n     }\n   },\n   \\"min_lr\\": 0.00001,\n   \\"zero_optimization\\": {\n    \\"stage\\": 1,\n    \\"allgather_partitions\\": True,\n    \\"allgather_bucket_size\\": 500000000,\n    \\"overlap_comm\\": True,\n    \\"reduce_scatter\\": True,\n    \\"reduce_bucket_size\\": 500000000,\n    \\"contiguous_gradients\\": True,\n    \\"cpu_offload\\": False\n  },\n\n   # batch / data settings\n   \\"train_micro_batch_size_per_gpu\\": 8,\n   \\"gradient_accumulation_steps\\": 2,\n   \\"data-impl\\": \\"mmap\\",\n   # \\"split\\": \\"949,50,1\\",\n\n   # activation checkpointing\n   \\"checkpoint-activations\\": true,\n   \\"checkpoint-num-layers\\": 1,\n   \\"partition-activations\\": true,\n   \\"synchronize-each-layer\\": true,\n\n   # regularization\n   \\"gradient_clipping\\": 1.0,\n   \\"weight-decay\\": 0.1,\n   \\"hidden-dropout\\": 0,\n   \\"attention-dropout\\": 0,\n\n   # precision settings\n   # \\"attention_softmax_in_fp32\\": true,\n   \\"fp16\\": {\n     \\"fp16\\": true,\n     \\"enabled\\": true,\n     \\"initial_scale_power\\": 32,\n     \\"loss_scale_window\\": 1000,\n     \\"hysteresis\\": 2,\n     \\"min_loss_scale\\": 1\n   },\n\n   # misc. training settings\n   \\"train-iters\\": 500000,\n   \\"lr-decay-iters\\": 500000,\n   \\"distributed-backend\\": \\"nccl\\",\n   \\"lr-decay-style\\": \\"cosine\\",\n   \\"warmup\\": 0.01,\n   \\"save-interval\\": 1000,\n   \\"eval-interval\\": 1000,\n   \\"eval-tasks-interval\\": 50000000000,\n   \\"eval-iters\\": 10,\n\n   # logging\n   \\"log-interval\\": 100,\n   \\"steps_per_print\\": 10,\n   \\"keep-last-n-checkpoints\\": 5,\n   \\"wall_clock_breakdown\\": true,\n\n   # wandb\n   \\"use_wandb\\": true,\n   #  \\"wandb_init_all_ranks\\": true,\n   \\"wandb_project\\": \\"polyglot-ko-12_8b\\",\n   \\"eval_tasks\\": [\\"nsmc\\"],\n\n   # deepspeed launcher\n   \\"launcher\\": \\"openmpi\\",\n   \\"deepspeed_mpi\\": true\n}\n\"}, \"save_interval\": 1000, \"batch_size\": 8, \"train_iters\": 500000, \"eval_iters\": 10, \"keep_last_n_checkpoints\": 5, \"eval_tasks_interval\": 50000000000, \"vocab_file\": \"./tokenizer/MBBPE/tokenizer.json\", \"attention_dropout\": 0, \"hidden_dropout\": 0, \"weight_decay\": 0.1, \"checkpoint_activations\": true, \"synchronize_each_layer\": true, \"partition_activations\": true, \"gas\": 2, \"clip_grad\": 1.0, \"dynamic_loss_scale\": true, \"pipe_parallel_size\": 1, \"model_parallel_size\": 4, \"is_pipe_parallel\": true, \"use_wandb\": true, \"wandb_group\": \"bbmEhMAwoKq4TkRTSAByxf_srvwlu70\", \"wandb_team\": \"eleutherai-oslo\", \"wandb_project\": \"polyglot-ko-12_8b\", \"log_interval\": 100, \"text_gen_type\": \"unconditional\", \"eval_tasks\": [\"nsmc\"], \"deepspeed_mpi\": true, \"user_script\": \"/fsx/polyglot.train/gpt-neox/train.py\", \"global_num_gpus\": 16}"
System Hardware
| CPU count | 96 | 
| GPU count | 8 | 
| GPU type | NVIDIA A100-SXM4-40GB | 
W&B CLI Version
0.12.21
Config
Config parameters are your model's inputs. Learn more
- {} 185 keys▶- "gelu"
- false
- 1,000
- null
- false
- [] 36 items▶
- 0
- false
- 8
- false
- true
- false
- true
- false
- 1
- false
- 1
- {} 1 key▶- "# GPT-2 pretraining setup { # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in "tokenizer-type": "HFTokenizer", "vocab-file": "./tokenizer/MBBPE/tokenizer.json", "save": "/fsx/polyglot.train/gpt-neox/checkpoints/13B_scratch", # "load": "/fsx/polyglot.train/gpt-neox/checkpoints/6B_scratch", # wandb config "wandb_team": "eleutherai-oslo", # If finetuning, edit the following to the location of your finetuning dataset: "data-path": "/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document", # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 1, "model-parallel-size": 4, # model settings "num-layers": 36, "hidden-size": 5120, "num-attention-heads": 40, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "no-weight-tying": true, "rotary_ndims": 64, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": true, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 0.0001, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 0.00001, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 500000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 500000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 2, "data-impl": "mmap", # "split": "949,50,1", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": true, "synchronize-each-layer": true, # regularization "gradient_clipping": 1.0, "weight-decay": 0.1, "hidden-dropout": 0, "attention-dropout": 0, # precision settings # "attention_softmax_in_fp32": true, "fp16": { "fp16": true, "enabled": true, "initial_scale_power": 32, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 500000, "lr-decay-iters": 500000, "distributed-backend": "nccl", "lr-decay-style": "cosine", "warmup": 0.01, "save-interval": 1000, "eval-interval": 1000, "eval-tasks-interval": 50000000000, "eval-iters": 10, # logging "log-interval": 100, "steps_per_print": 10, "keep-last-n-checkpoints": 5, "wall_clock_breakdown": true, # wandb "use_wandb": true, # "wandb_init_all_ranks": true, "wandb_project": "polyglot-ko-12_8b", "eval_tasks": ["nsmc"], # deepspeed launcher "launcher": "openmpi", "deepspeed_mpi": true } "
 
- false
- "mmap"
- "/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document"
- false
- null
- true
- true
- true
- false
- "nccl"
- null
- null
- null
- false
- true
- false
- 1,000
- 10
- ""
- [] 1 item▶- "nsmc"
 
- 50,000,000,000
- null
- null
- false
- null
- {} 6 keys▶
- false
- false
- {} 8 keys▶
- 500,000,000
- true
- 1
 46 ... 95▶▶96 ... 145▶▶146 ... 180▶▶
Summary
Summary metrics are your model's outputs. Learn more
No summary metrics saved for this run.
Check the summary metrics documentation for more information.
Artifact Outputs
This run produced these artifacts as outputs. Total: 1. Learn more
Type
Name
Consumer count
Loading...