Eleutherai-oslo's group workspace
NBe5KPwXzrYMxyMub69E5D_2sgn7hji
What makes this group special?
Tags
ip-26-0-143-225-0
Notes
Author
State
Failed
Start time
April 20th, 2023 2:23:36 AM
Runtime
9s
Tracked hours
7s
Run path
eleutherai-oslo/polyglot-ko-12_8b/3um2in1e
OS
Linux-5.15.0-1019-aws-x86_64-with-glibc2.31
Python version
3.9.16
Git repository
git clone https://github.com/EleutherAI/gpt-neox.git
Git state
git checkout -b "ip-26-0-143-225-0" bf9b3012943259407c4274604504193228b3c3f0
Command
/fsx/polyglot.train/gpt-neox/train.py --deepspeed_config "{\"train_batch_size\": 64, \"train_micro_batch_size_per_gpu\": 8, \"gradient_accumulation_steps\": 2, \"optimizer\": {\"type\": \"Adam\", \"params\": {\"lr\": 0.0001, \"betas\": [0.9, 0.95], \"eps\": 1e-08}}, \"fp16\": {\"fp16\": true, \"enabled\": true, \"initial_scale_power\": 32, \"loss_scale_window\": 1000, \"hysteresis\": 2, \"min_loss_scale\": 1}, \"gradient_clipping\": 1.0, \"zero_optimization\": {\"stage\": 1, \"allgather_partitions\": true, \"allgather_bucket_size\": 500000000, \"overlap_comm\": true, \"reduce_scatter\": true, \"reduce_bucket_size\": 500000000, \"contiguous_gradients\": true, \"cpu_offload\": false}, \"wall_clock_breakdown\": true}" --megatron_config "{\"hostfile\": \"/fsx/polyglot.train/hostfiles/hosts_163642\", \"launcher\": \"openmpi\", \"train_batch_size\": 64, \"train_micro_batch_size_per_gpu\": 8, \"gradient_accumulation_steps\": 2, \"optimizer\": {\"type\": \"Adam\", \"params\": {\"lr\": 0.0001, \"betas\": [0.9, 0.95], \"eps\": 1e-08}}, \"fp16\": {\"fp16\": true, \"enabled\": true, \"initial_scale_power\": 32, \"loss_scale_window\": 1000, \"hysteresis\": 2, \"min_loss_scale\": 1}, \"gradient_clipping\": 1.0, \"zero_optimization\": {\"stage\": 1, \"allgather_partitions\": true, \"allgather_bucket_size\": 500000000, \"overlap_comm\": true, \"reduce_scatter\": true, \"reduce_bucket_size\": 500000000, \"contiguous_gradients\": true, \"cpu_offload\": false}, \"wall_clock_breakdown\": true, \"precision\": \"fp16\", \"num_layers\": 36, \"hidden_size\": 5120, \"num_attention_heads\": 40, \"seq_length\": 2048, \"max_position_embeddings\": 2048, \"pos_emb\": \"rotary\", \"no_weight_tying\": true, \"attention_config\": [\"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\"], \"sparsity_config\": {}, \"scaled_upper_triang_masked_softmax_fusion\": true, \"bias_gelu_fusion\": true, \"rotary_ndims\": 64, \"init_method\": \"small_init\", \"output_layer_init_method\": \"wang_init\", \"gpt_j_residual\": true, \"output_layer_parallelism\": \"column\", \"lr_decay_style\": \"cosine\", \"lr_decay_iters\": 500000, \"min_lr\": 1e-05, \"optimizer_type\": \"Adam\", \"zero_stage\": 1, \"zero_reduce_scatter\": true, \"zero_contiguous_gradients\": true, \"zero_reduce_bucket_size\": 500000000, \"zero_allgather_bucket_size\": 500000000, \"lr\": 0.0001, \"tokenizer_type\": \"HFTokenizer\", \"data_path\": \"/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document\", \"data_impl\": \"mmap\", \"save\": \"/fsx/polyglot.train/gpt-neox/checkpoints/13B_scratch\", \"config_files\": {\"13B_ko.yml\": \"# GPT-2 pretraining setup\n{\n  # Tokenizer /  checkpoint settings - you will need to change these to the location you have them saved in\n  \\"tokenizer-type\\": \\"HFTokenizer\\",\n  \\"vocab-file\\": \\"./tokenizer/MBBPE/tokenizer.json\\",\n  \\"save\\": \\"/fsx/polyglot.train/gpt-neox/checkpoints/13B_scratch\\",\n  # \\"load\\": \\"/fsx/polyglot.train/gpt-neox/checkpoints/6B_scratch\\",\n\n  # wandb config\n  \\"wandb_team\\": \\"eleutherai-oslo\\",\n\n  # If finetuning, edit the following to the location of your finetuning dataset:\n  \\"data-path\\": \\"/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document\\",\n   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n   # across the node boundaries )\n   \\"pipe-parallel-size\\": 1,\n   \\"model-parallel-size\\": 4,\n\n   # model settings\n   \\"num-layers\\": 36,\n   \\"hidden-size\\": 5120,\n   \\"num-attention-heads\\": 40,\n   \\"seq-length\\": 2048,\n   \\"max-position-embeddings\\": 2048,\n   \\"norm\\": \\"layernorm\\",\n   \\"pos-emb\\": \\"rotary\\",\n   \\"no-weight-tying\\": true,\n   \\"rotary_ndims\\": 64,\n   \\"gpt_j_residual\\": true,\n   \\"output_layer_parallelism\\": \\"column\\",\n\n   # these should provide some speedup but takes a while to build, set to true if desired\n   \\"scaled-upper-triang-masked-softmax-fusion\\": true,\n   \\"bias-gelu-fusion\\": true,\n\n   # init methods\n   \\"init_method\\": \\"small_init\\",\n   \\"output_layer_init_method\\": \\"wang_init\\",\n\n   # optimizer settings\n   \\"optimizer\\": {\n     \\"type\\": \\"Adam\\",\n     \\"params\\": {\n       \\"lr\\": 0.0001,\n       \\"betas\\": [0.9, 0.95],\n       \\"eps\\": 1.0e-8,\n     }\n   },\n   \\"min_lr\\": 0.00001,\n   \\"zero_optimization\\": {\n    \\"stage\\": 1,\n    \\"allgather_partitions\\": True,\n    \\"allgather_bucket_size\\": 500000000,\n    \\"overlap_comm\\": True,\n    \\"reduce_scatter\\": True,\n    \\"reduce_bucket_size\\": 500000000,\n    \\"contiguous_gradients\\": True,\n    \\"cpu_offload\\": False\n  },\n\n   # batch / data settings\n   \\"train_micro_batch_size_per_gpu\\": 8,\n   \\"gradient_accumulation_steps\\": 2,\n   \\"data-impl\\": \\"mmap\\",\n   # \\"split\\": \\"949,50,1\\",\n\n   # activation checkpointing\n   \\"checkpoint-activations\\": true,\n   \\"checkpoint-num-layers\\": 1,\n   \\"partition-activations\\": true,\n   \\"synchronize-each-layer\\": true,\n\n   # regularization\n   \\"gradient_clipping\\": 1.0,\n   \\"weight-decay\\": 0.1,\n   \\"hidden-dropout\\": 0,\n   \\"attention-dropout\\": 0,\n\n   # precision settings\n   # \\"attention_softmax_in_fp32\\": true,\n   \\"fp16\\": {\n     \\"fp16\\": true,\n     \\"enabled\\": true,\n     \\"initial_scale_power\\": 32,\n     \\"loss_scale_window\\": 1000,\n     \\"hysteresis\\": 2,\n     \\"min_loss_scale\\": 1\n   },\n\n   # misc. training settings\n   \\"train-iters\\": 500000,\n   \\"lr-decay-iters\\": 500000,\n   \\"distributed-backend\\": \\"nccl\\",\n   \\"lr-decay-style\\": \\"cosine\\",\n   \\"warmup\\": 0.01,\n   \\"save-interval\\": 1000,\n   \\"eval-interval\\": 1000,\n   \\"eval-tasks-interval\\": 50000000000,\n   \\"eval-iters\\": 10,\n\n   # logging\n   \\"log-interval\\": 100,\n   \\"steps_per_print\\": 10,\n   \\"keep-last-n-checkpoints\\": 5,\n   \\"wall_clock_breakdown\\": true,\n\n   # wandb\n   \\"use_wandb\\": true,\n   #  \\"wandb_init_all_ranks\\": true,\n   \\"wandb_project\\": \\"polyglot-ko-12_8b\\",\n   \\"eval_tasks\\": [\\"nsmc\\"],\n\n   # deepspeed launcher\n   \\"launcher\\": \\"openmpi\\",\n   \\"deepspeed_mpi\\": true\n}\n\"}, \"save_interval\": 1000, \"batch_size\": 8, \"train_iters\": 500000, \"eval_iters\": 10, \"keep_last_n_checkpoints\": 5, \"eval_tasks_interval\": 50000000000, \"vocab_file\": \"./tokenizer/MBBPE/tokenizer.json\", \"attention_dropout\": 0, \"hidden_dropout\": 0, \"weight_decay\": 0.1, \"checkpoint_activations\": true, \"synchronize_each_layer\": true, \"partition_activations\": true, \"gas\": 2, \"clip_grad\": 1.0, \"dynamic_loss_scale\": true, \"pipe_parallel_size\": 1, \"model_parallel_size\": 4, \"is_pipe_parallel\": true, \"use_wandb\": true, \"wandb_group\": \"NBe5KPwXzrYMxyMub69E5D_2sgn7hji\", \"wandb_team\": \"eleutherai-oslo\", \"wandb_project\": \"polyglot-ko-12_8b\", \"log_interval\": 100, \"text_gen_type\": \"unconditional\", \"eval_tasks\": [\"nsmc\"], \"deepspeed_mpi\": true, \"user_script\": \"/fsx/polyglot.train/gpt-neox/train.py\", \"global_num_gpus\": 16}"
System Hardware
| CPU count | 96 | 
| GPU count | 8 | 
| GPU type | NVIDIA A100-SXM4-40GB | 
W&B CLI Version
0.12.21
Config
Config parameters are your model's inputs. Learn more
- {} 185 keys▶- "gelu"
- false
- 1,000
- null
- false
- [] 36 items▶
- 0
- false
- 8
- false
- true
- false
- true
- false
- 1
- false
- 1
- {} 1 key▶- "# GPT-2 pretraining setup { # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in "tokenizer-type": "HFTokenizer", "vocab-file": "./tokenizer/MBBPE/tokenizer.json", "save": "/fsx/polyglot.train/gpt-neox/checkpoints/13B_scratch", # "load": "/fsx/polyglot.train/gpt-neox/checkpoints/6B_scratch", # wandb config "wandb_team": "eleutherai-oslo", # If finetuning, edit the following to the location of your finetuning dataset: "data-path": "/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document", # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 1, "model-parallel-size": 4, # model settings "num-layers": 36, "hidden-size": 5120, "num-attention-heads": 40, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "no-weight-tying": true, "rotary_ndims": 64, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": true, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 0.0001, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 0.00001, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 500000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 500000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 2, "data-impl": "mmap", # "split": "949,50,1", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": true, "synchronize-each-layer": true, # regularization "gradient_clipping": 1.0, "weight-decay": 0.1, "hidden-dropout": 0, "attention-dropout": 0, # precision settings # "attention_softmax_in_fp32": true, "fp16": { "fp16": true, "enabled": true, "initial_scale_power": 32, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 500000, "lr-decay-iters": 500000, "distributed-backend": "nccl", "lr-decay-style": "cosine", "warmup": 0.01, "save-interval": 1000, "eval-interval": 1000, "eval-tasks-interval": 50000000000, "eval-iters": 10, # logging "log-interval": 100, "steps_per_print": 10, "keep-last-n-checkpoints": 5, "wall_clock_breakdown": true, # wandb "use_wandb": true, # "wandb_init_all_ranks": true, "wandb_project": "polyglot-ko-12_8b", "eval_tasks": ["nsmc"], # deepspeed launcher "launcher": "openmpi", "deepspeed_mpi": true } "
 
- false
- "mmap"
- "/fsx/polyglot.train/gpt-neox/processed/multi_ko_13b_text_document"
- false
- null
- true
- true
- true
- false
- "nccl"
- null
- null
- null
- false
- true
- false
- 1,000
- 10
- ""
- [] 1 item▶- "nsmc"
 
- 50,000,000,000
- null
- null
- false
- null
- {} 6 keys▶
- false
- false
- {} 8 keys▶
- 500,000,000
- true
- 1
 46 ... 95▶▶96 ... 145▶▶146 ... 180▶▶
Artifact Outputs
This run produced these artifacts as outputs. Total: 1. Learn more
Type
Name
Consumer count
Loading...