
Eleutherai-oslo's group workspace

polyglot-ko-1.3B

Tags

gpu-st-p4d-24xlarge-39-0

Notes
State
Crashed
Start time
September 10th, 2022 10:53:53 AM
Runtime
2d 6h 2m 52s
Tracked hours
-
Run path
eleutherai/polyglot-ko/sce7oipg
OS
Linux-5.10.126-117.518.amzn2.x86_64-x86_64-with-glibc2.26
Python version
3.9.12
Git repository
git clone https://github.com/EleutherAI/gpt-neox.git
Git state
git checkout -b "gpu-st-p4d-24xlarge-39-0" bf9b3012943259407c4274604504193228b3c3f0
Command
/fsx/multi-lingual-6b/gpt-neox/train.py --deepspeed_config "{\"train_batch_size\": 1024, \"train_micro_batch_size_per_gpu\": 4, \"optimizer\": {\"type\": \"Adam\", \"params\": {\"lr\": 0.0002, \"betas\": [0.9, 0.95], \"eps\": 1e-08}}, \"fp16\": {\"fp16\": true, \"enabled\": true, \"loss_scale\": 0, \"initial_scale_power\": 32, \"loss_scale_window\": 1000, \"hysteresis\": 2, \"min_loss_scale\": 1}, \"gradient_clipping\": 1.0, \"zero_optimization\": {\"stage\": 1, \"allgather_partitions\": true, \"allgather_bucket_size\": 500000000, \"overlap_comm\": true, \"reduce_scatter\": true, \"reduce_bucket_size\": 500000000, \"contiguous_gradients\": true, \"cpu_offload\": false}, \"wall_clock_breakdown\": true}" --megatron_config "{\"hostfile\": \"/fsx/multi-lingual-6b/hostfiles/hosts_9982\", \"launcher\": \"openmpi\", \"train_batch_size\": 1024, \"train_micro_batch_size_per_gpu\": 4, \"optimizer\": {\"type\": \"Adam\", \"params\": {\"lr\": 0.0002, \"betas\": [0.9, 0.95], \"eps\": 1e-08}}, \"fp16\": {\"fp16\": true, \"enabled\": true, \"loss_scale\": 0, \"initial_scale_power\": 32, \"loss_scale_window\": 1000, \"hysteresis\": 2, \"min_loss_scale\": 1}, \"gradient_clipping\": 1.0, \"zero_optimization\": {\"stage\": 1, \"allgather_partitions\": true, \"allgather_bucket_size\": 500000000, \"overlap_comm\": true, \"reduce_scatter\": true, \"reduce_bucket_size\": 500000000, \"contiguous_gradients\": true, \"cpu_offload\": false}, \"wall_clock_breakdown\": true, \"precision\": \"fp16\", \"num_layers\": 24, \"hidden_size\": 2048, \"num_attention_heads\": 16, \"seq_length\": 2048, \"max_position_embeddings\": 2048, \"pos_emb\": \"rotary\", \"no_weight_tying\": true, \"attention_config\": [\"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\"], \"sparsity_config\": {}, \"bias_gelu_fusion\": true, \"attention_softmax_in_fp32\": true, \"rotary_ndims\": 64, \"gpt_j_residual\": true, \"output_layer_parallelism\": \"column\", \"lr_decay_style\": \"cosine\", \"lr_decay_iters\": 320000, \"optimizer_type\": \"Adam\", \"zero_stage\": 1, \"zero_reduce_scatter\": true, \"zero_contiguous_gradients\": true, \"zero_reduce_bucket_size\": 500000000, \"zero_allgather_bucket_size\": 500000000, \"lr\": 0.0002, \"tokenizer_type\": \"HFTokenizer\", \"data_path\": \"/fsx/multi-lingual-6b/gpt-neox/processed/multi_ko_1b_text_document\", \"data_impl\": \"mmap\", \"save\": \"/fsx/multi-lingual-6b/gpt-neox/checkpoints/1B_scratch\", \"config_files\": {\"1B_ko.yml\": \"# Same as GPT-Neo 1.3B\n{\n # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in\n \\"tokenizer-type\\": \\"HFTokenizer\\",\n \\"vocab-file\\": \\"./tokenizer/MBBPE/tokenizer.json\\",\n \\"save\\": \\"/fsx/multi-lingual-6b/gpt-neox/checkpoints/1B_scratch\\",\n \\"load\\": \\"/fsx/multi-lingual-6b/gpt-neox/checkpoints/1B_scratch\\",\n\n # wandb config\n \\"wandb_team\\": \\"eleutherai-oslo\\",\n\n # If finetuning, edit the following to the location of your finetuning dataset:\n \\"data-path\\": \\"/fsx/multi-lingual-6b/gpt-neox/processed/multi_ko_1b_text_document\\",\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n # across the node boundaries )\n \\"is_pipe_parallel\\": false,\n \\"pipe-parallel-size\\": 1,\n 
\\"model-parallel-size\\": 1,\n\n # model settings\n \\"num-layers\\": 24,\n \\"hidden-size\\": 2048,\n \\"num-attention-heads\\": 16,\n \\"seq-length\\": 2048,\n \\"max-position-embeddings\\": 2048,\n \\"norm\\": \\"layernorm\\",\n \\"pos-emb\\": \\"rotary\\",\n \\"no-weight-tying\\": true,\n \\"rotary_ndims\\": 64,\n \\"gpt_j_residual\\": true,\n \\"output_layer_parallelism\\": \\"column\\",\n\n # these should provide some speedup but takes a while to build, set to true if desired\n \\"scaled-upper-triang-masked-softmax-fusion\\": false,\n \\"bias-gelu-fusion\\": true,\n\n\n # optimizer settings\n \\"optimizer\\": {\n \\"type\\": \\"Adam\\",\n \\"params\\": {\n \\"lr\\": 2.0e-4,\n \\"betas\\": [0.9, 0.95],\n \\"eps\\": 1.0e-8,\n }\n },\n \\"zero_optimization\\": {\n \\"stage\\": 1,\n \\"allgather_partitions\\": True,\n \\"allgather_bucket_size\\": 500000000,\n \\"overlap_comm\\": True,\n \\"reduce_scatter\\": True,\n \\"reduce_bucket_size\\": 500000000,\n \\"contiguous_gradients\\": True,\n \\"cpu_offload\\": False\n },\n\n # batch / data settings\n \\"train_micro_batch_size_per_gpu\\": 4,\n \\"gradient_accumulation_steps\\": 1,\n \\"data-impl\\": \\"mmap\\",\n \\"split\\": \\"949,50,1\\",\n\n # activation checkpointing\n \\"checkpoint-activations\\": true,\n \\"checkpoint-num-layers\\": 1,\n \\"partition-activations\\": true,\n \\"synchronize-each-layer\\": true,\n\n # regularization\n \\"gradient_clipping\\": 1.0,\n \\"weight-decay\\": 0.01,\n \\"hidden-dropout\\": 0,\n \\"attention-dropout\\": 0,\n\n # precision settings\n \\"attention_softmax_in_fp32\\": true,\n \\"fp16\\": {\n \\"fp16\\": true,\n \\"enabled\\": true,\n \\"loss_scale\\": 0,\n \\"initial_scale_power\\": 32,\n \\"loss_scale_window\\": 1000,\n \\"hysteresis\\": 2,\n \\"min_loss_scale\\": 1\n },\n\n # misc. training settings\n \\"train-iters\\": 320000,\n \\"lr-decay-iters\\": 320000,\n \\"distributed-backend\\": \\"nccl\\",\n \\"lr-decay-style\\": \\"cosine\\",\n \\"warmup\\": 0.01,\n \\"save-interval\\": 500,\n \\"eval-interval\\": 1000,\n \\"eval-tasks-interval\\": 50000000000,\n \\"eval-iters\\": 10,\n\n # logging\n \\"log-interval\\": 100,\n \\"steps_per_print\\": 10,\n \\"keep-last-n-checkpoints\\": 4,\n \\"wall_clock_breakdown\\": true,\n\n # wandb\n \\"use_wandb\\": true,\n \\"wandb_project\\": \\"gpt-neox-ko-1b\\",\n \\"eval_tasks\\": [\\"nsmc\\"],\n\n # deepspeed launcher\n \\"launcher\\": \\"openmpi\\",\n \\"deepspeed_mpi\\": true\n}\n\"}, \"load\": \"/fsx/multi-lingual-6b/gpt-neox/checkpoints/1B_scratch\", \"save_interval\": 500, \"batch_size\": 4, \"train_iters\": 320000, \"eval_iters\": 10, \"keep_last_n_checkpoints\": 4, \"eval_tasks_interval\": 50000000000, \"split\": \"949,50,1\", \"vocab_file\": \"./tokenizer/MBBPE/tokenizer.json\", \"attention_dropout\": 0, \"hidden_dropout\": 0, \"checkpoint_activations\": true, \"synchronize_each_layer\": true, \"partition_activations\": true, \"gas\": 1, \"clip_grad\": 1.0, \"dynamic_loss_scale\": true, \"pipe_parallel_size\": 1, \"is_pipe_parallel\": true, \"use_wandb\": true, \"wandb_group\": \"9zSd2A5esuu6jh8iTrEaJN_1agv0phc\", \"wandb_team\": \"eleutherai-oslo\", \"wandb_project\": \"gpt-neox-ko-1b\", \"log_interval\": 100, \"text_gen_type\": \"unconditional\", \"eval_tasks\": [\"nsmc\"], \"deepspeed_mpi\": true, \"user_script\": \"/fsx/multi-lingual-6b/gpt-neox/train.py\", \"global_num_gpus\": 256}"
System Hardware
CPU count
48
GPU count
8
GPU type
NVIDIA A100-SXM4-40GB
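
Combined with the 256 GPUs reported in the command above, the per-node hardware implies a 32-node job; the host tag suggests AWS p4d.24xlarge-class instances (8 A100s per node), which is an inference from the tag rather than something stated on this page. A quick Python sanity check:

# Cluster size implied by the run metadata.
gpus_per_node = 8          # "GPU count" above
global_num_gpus = 256      # "global_num_gpus" in the command
num_nodes = global_num_gpus // gpus_per_node
print(num_nodes)           # -> 32 nodes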
W&B CLI Version
0.12.21
Config

Config parameters are your model's inputs.

  • {} 185 keys (model, optimizer, ZeRO, fp16, batch/data, and logging settings; the full configuration, including the embedded 1B_ko.yml, is reproduced in the Command section above)
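
For a rough sense of scale, the model settings (24 layers, hidden size 2048, untied input/output embeddings) line up with the 1.3B label. Below is a back-of-the-envelope Python sketch using the standard 12·L·h² approximation for transformer-block parameters; the tokenizer vocabulary size is not shown on this page, so the value used here is an assumption for illustration only.

# Rough parameter count from the config above (a sketch; vocab size is an assumption).
num_layers = 24            # "num-layers": 24
hidden_size = 2048         # "hidden-size": 2048
vocab_size = 30_000        # assumption: not reported on this page

block_params = 12 * num_layers * hidden_size ** 2   # ~1.21e9 in the transformer blocks
embed_params = 2 * vocab_size * hidden_size         # separate in/out embeddings ("no-weight-tying": true)
print(f"~{(block_params + embed_params) / 1e9:.2f}B")   # ~1.33B under these assumptions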
Summary

Summary metrics are your model's outputs.

No summary metrics saved for this run.

Artifact Outputs

This run produced these artifacts as outputs. Total: 1.
