

P3_2jmuwy18

Tags: finetune-4-0
Notes: -
State: Finished
Start time: February 27th, 2022 4:45:25 PM
Runtime: 9h 57m 51s
Tracked hours: -
Run path: eleutherai/gpt-thicc/1x2s3gwz
OS: Linux-5.11.0-34-generic-x86_64-with-glibc2.29
Python version: 3.8.10
Git repository: git clone https://github.com/EleutherAI/gpt-neox.git
Git state: git checkout -b "finetune-4-0" f6c611f3211521fa7b145950ea100f44a2d0ead6
Command
train.py --local_rank=0 --deepspeed_config "{\"train_batch_size\": 384, \"train_micro_batch_size_per_gpu\": 4, \"gradient_accumulation_steps\": 16, \"optimizer\": {\"type\": \"Adam\", \"params\": {\"lr\": 9.7e-08, \"betas\": [0.9, 0.95], \"eps\": 1e-08}}, \"fp16\": {\"fp16\": true, \"enabled\": true, \"loss_scale\": 0, \"loss_scale_window\": 1000, \"initial_scale_power\": 12, \"hysteresis\": 2, \"min_loss_scale\": 1}, \"gradient_clipping\": 1.0, \"zero_optimization\": {\"stage\": 1, \"allgather_partitions\": true, \"allgather_bucket_size\": 1260000000, \"overlap_comm\": true, \"reduce_scatter\": true, \"reduce_bucket_size\": 1260000000, \"contiguous_gradients\": true, \"cpu_offload\": false}, \"steps_per_print\": 2}" --megatron_config "{\"train_batch_size\": 384, \"train_micro_batch_size_per_gpu\": 4, \"gradient_accumulation_steps\": 16, \"optimizer\": {\"type\": \"Adam\", \"params\": {\"lr\": 9.7e-08, \"betas\": [0.9, 0.95], \"eps\": 1e-08}}, \"fp16\": {\"fp16\": true, \"enabled\": true, \"loss_scale\": 0, \"loss_scale_window\": 1000, \"initial_scale_power\": 12, \"hysteresis\": 2, \"min_loss_scale\": 1}, \"gradient_clipping\": 1.0, \"zero_optimization\": {\"stage\": 1, \"allgather_partitions\": true, \"allgather_bucket_size\": 1260000000, \"overlap_comm\": true, \"reduce_scatter\": true, \"reduce_bucket_size\": 1260000000, \"contiguous_gradients\": true, \"cpu_offload\": false}, \"steps_per_print\": 2, \"precision\": \"fp16\", \"num_layers\": 44, \"hidden_size\": 6144, \"num_attention_heads\": 64, \"seq_length\": 2048, \"max_position_embeddings\": 2048, \"pos_emb\": \"rotary\", \"no_weight_tying\": true, \"attention_config\": [\"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\", \"global\"], \"sparsity_config\": {}, \"bias_gelu_fusion\": true, \"rotary_pct\": 0.25, \"init_method\": \"small_init\", \"output_layer_init_method\": \"wang_init\", \"gpt_j_residual\": true, \"output_layer_parallelism\": \"column\", \"lr_decay_style\": \"constant\", \"min_lr\": 9.7e-08, \"override_lr_scheduler\": true, \"optimizer_type\": \"Adam\", \"zero_stage\": 1, \"zero_reduce_scatter\": true, \"zero_contiguous_gradients\": true, \"zero_reduce_bucket_size\": 1260000000, \"zero_allgather_bucket_size\": 1260000000, \"lr\": 9.7e-08, \"tokenizer_type\": \"HFTokenizer\", \"train_data_paths\": [\"/mnt/ssd-1/P3/P3_combined/train_text_document\"], \"test_data_paths\": [\"/mnt/ssd-1/P3/P3_combined/test_text_document\"], \"valid_data_paths\": [\"/mnt/ssd-1/P3/P3_combined/validation_text_document\"], \"train_data_weights\": [1.0], \"valid_data_weights\": [1.0], \"test_data_weights\": [1.0], \"data_impl\": \"mmap\", \"save\": \"/mnt/ssd-1/20B_P3\", \"config_files\": {\"20B_P3.yml\": \"# GPT-2 pretraining setup \n{\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages \n # across the node boundaries ) \n \\"pipe-parallel-size\\": 4, \n \\"model-parallel-size\\": 2, \n \n # model settings \n \\"num-layers\\": 44, \n \\"hidden-size\\": 6144, \n \\"num-attention-heads\\": 64, \n \\"seq-length\\": 2048, 
\n \\"max-position-embeddings\\": 2048, \n \\"norm\\": \\"layernorm\\", \n \\"pos-emb\\": \\"rotary\\",\n \\"rotary_pct\\": 0.25,\n \\"no-weight-tying\\": true,\n \\"gpt_j_residual\\": true, \n \\"output_layer_parallelism\\": \\"column\\",\n # these should provide some speedup but takes a while to build, set to true if desired \n \\"scaled-upper-triang-masked-softmax-fusion\\": false, \n \\"bias-gelu-fusion\\": true, \n\n # init methods\n \\"init_method\\": \\"small_init\\",\n \\"output_layer_init_method\\": \\"wang_init\\",\n \n \n # optimizer settings \n \\"optimizer\\": { \n \\"type\\": \\"Adam\\", \n \\"params\\": { \n \\"lr\\": 9.7e-8, \n \\"betas\\": [0.9, 0.95],\n \\"eps\\": 1.0e-8,\n }\n },\n \n \\"min_lr\\": 9.7e-8,\n \\"override-lr-scheduler\\": True,\n \\"zero_optimization\\": {\n \\"stage\\": 1,\n \\"allgather_partitions\\": True,\n \\"allgather_bucket_size\\": 1260000000,\n \\"overlap_comm\\": True,\n \\"reduce_scatter\\": True,\n \\"reduce_bucket_size\\": 1260000000,\n \\"contiguous_gradients\\": True,\n \\"cpu_offload\\": False\n },\n\n # batch / data settings (assuming 32 GPUs)\n \\"train_micro_batch_size_per_gpu\\": 4,\n \\"gradient_accumulation_steps\\": 16,\n \\"data-impl\\": \\"mmap\\",\n \n # activation checkpointing\n \\"checkpoint-activations\\": true,\n \\"checkpoint-num-layers\\": 1,\n \\"partition-activations\\": false,\n \\"synchronize-each-layer\\": true,\n\n # regularization\n \\"gradient_clipping\\": 1.0,\n \\"weight-decay\\": 0.01,\n \\"hidden-dropout\\": 0,\n \\"attention-dropout\\": 0,\n\n # precision settings\n \\"fp16\\": { \n \\"fp16\\": true,\n \\"enabled\\": true,\n \\"loss_scale\\": 0,\n \\"loss_scale_window\\": 1000,\n \\"initial_scale_power\\": 12,\n \\"hysteresis\\": 2,\n \\"min_loss_scale\\": 1\n },\n\n # misc. 
training settings\n \\"train-iters\\": 200000,\n \\"distributed-backend\\": \\"nccl\\",\n \\"lr-decay-style\\": \\"constant\\",\n \\"save-interval\\": 500,\n \\"eval-interval\\": 100,\n \\"eval-iters\\": 1,\n# \\"eval_tasks\\": [\\"lambada\\", \\"piqa\\", \\"hellaswag\\", \\"winogrande\\", \\"mathqa\\", \\"pubmedqa\\"],\n\n # logging\n \\"log-interval\\": 2,\n \\"steps_per_print\\": 2,\n \\"wall_clock_breakdown\\": false,\n\n ### NEW DATA: ####\n \\"tokenizer_type\\": \\"HFTokenizer\\",\n \\"vocab-file\\": \\"/mnt/ssd-1/data/20B_tokenizer.json\\",\n \\"train_data_paths\\": [\\"/mnt/ssd-1/P3/P3_combined/train_text_document\\"],\n \\"valid_data_paths\\": [\\"/mnt/ssd-1/P3/P3_combined/validation_text_document\\"],\n \\"test_data_paths\\": [\\"/mnt/ssd-1/P3/P3_combined/test_text_document\\"],\n \\"train_data_weights\\": [1.0], \\"valid_data_weights\\": [1.0], \\"test_data_weights\\": [1.0],\n\n \\"save\\": \\"/mnt/ssd-1/20B_P3\\",\n \\"load\\": \\"/mnt/ssd-1/20B_P3\\",\n \\"keep_last_n_checkpoints\\": 3,\n \\"tensorboard-dir\\": \\"/mnt/ssd-1/tensorboard\\",\n \\"log-dir\\": \\"/mnt/ssd-1/logs\\",\n \\"wandb_team\\": \\"eleutherai\\",\n \\"wandb_project\\": \\"gpt-thicc\\",\n \\"wandb_group\\": \\"P3\\"\n }\n\"}, \"load\": \"/mnt/ssd-1/20B_P3\", \"save_interval\": 500, \"batch_size\": 4, \"train_iters\": 200000, \"eval_iters\": 1, \"keep_last_n_checkpoints\": 3, \"eval_interval\": 100, \"vocab_file\": \"/mnt/ssd-1/data/20B_tokenizer.json\", \"attention_dropout\": 0, \"hidden_dropout\": 0, \"checkpoint_activations\": true, \"synchronize_each_layer\": true, \"gas\": 16, \"clip_grad\": 1.0, \"dynamic_loss_scale\": true, \"pipe_parallel_size\": 4, \"model_parallel_size\": 2, \"is_pipe_parallel\": true, \"wandb_group\": \"P3_2jmuwy18\", \"wandb_team\": \"eleutherai\", \"wandb_project\": \"gpt-thicc\", \"log_dir\": \"/mnt/ssd-1/logs\", \"tensorboard_dir\": \"/mnt/ssd-1/tensorboard\", \"log_interval\": 2, \"text_gen_type\": \"unconditional\", \"user_script\": \"train.py\", \"global_num_gpus\": 48}"
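For readability, the 20B_P3.yml file passed via config_files in the command above is reflowed here. All keys, values, and comments are copied from the escaped string in that command; only line breaks and indentation have been restored:

# GPT-2 pretraining setup
{
  # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages across the node boundaries )
  "pipe-parallel-size": 4,
  "model-parallel-size": 2,
  # model settings
  "num-layers": 44,
  "hidden-size": 6144,
  "num-attention-heads": 64,
  "seq-length": 2048,
  "max-position-embeddings": 2048,
  "norm": "layernorm",
  "pos-emb": "rotary",
  "rotary_pct": 0.25,
  "no-weight-tying": true,
  "gpt_j_residual": true,
  "output_layer_parallelism": "column",
  # these should provide some speedup but takes a while to build, set to true if desired
  "scaled-upper-triang-masked-softmax-fusion": false,
  "bias-gelu-fusion": true,
  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",
  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 9.7e-8,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8,
    }
  },
  "min_lr": 9.7e-8,
  "override-lr-scheduler": True,
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": True,
    "allgather_bucket_size": 1260000000,
    "overlap_comm": True,
    "reduce_scatter": True,
    "reduce_bucket_size": 1260000000,
    "contiguous_gradients": True,
    "cpu_offload": False
  },
  # batch / data settings (assuming 32 GPUs)
  "train_micro_batch_size_per_gpu": 4,
  "gradient_accumulation_steps": 16,
  "data-impl": "mmap",
  # activation checkpointing
  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": false,
  "synchronize-each-layer": true,
  # regularization
  "gradient_clipping": 1.0,
  "weight-decay": 0.01,
  "hidden-dropout": 0,
  "attention-dropout": 0,
  # precision settings
  "fp16": {
    "fp16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  # misc. training settings
  "train-iters": 200000,
  "distributed-backend": "nccl",
  "lr-decay-style": "constant",
  "save-interval": 500,
  "eval-interval": 100,
  "eval-iters": 1,
  # "eval_tasks": ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"],
  # logging
  "log-interval": 2,
  "steps_per_print": 2,
  "wall_clock_breakdown": false,
  ### NEW DATA: ####
  "tokenizer_type": "HFTokenizer",
  "vocab-file": "/mnt/ssd-1/data/20B_tokenizer.json",
  "train_data_paths": ["/mnt/ssd-1/P3/P3_combined/train_text_document"],
  "valid_data_paths": ["/mnt/ssd-1/P3/P3_combined/validation_text_document"],
  "test_data_paths": ["/mnt/ssd-1/P3/P3_combined/test_text_document"],
  "train_data_weights": [1.0], "valid_data_weights": [1.0], "test_data_weights": [1.0],
  "save": "/mnt/ssd-1/20B_P3",
  "load": "/mnt/ssd-1/20B_P3",
  "keep_last_n_checkpoints": 3,
  "tensorboard-dir": "/mnt/ssd-1/tensorboard",
  "log-dir": "/mnt/ssd-1/logs",
  "wandb_team": "eleutherai",
  "wandb_project": "gpt-thicc",
  "wandb_group": "P3"
}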
System Hardware
CPU count: 128
GPU count: 8
GPU type: NVIDIA A100-SXM4-40GB
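The GPU count above presumably reflects only the node where logging ran; the command records "global_num_gpus": 48 (6 nodes of 8 A100s). With "pipe_parallel_size": 4 and "model_parallel_size": 2, that yields 48 / (4 × 2) = 6 data-parallel replicas, so the global batch works out to 4 (micro-batch per GPU) × 16 (gradient accumulation steps) × 6 = 384, matching the "train_batch_size": 384 in the command.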
W&B CLI version: 0.10.28
Config

Config parameters are your model's inputs.

Logged config: 183 keys, mirroring the --deepspeed_config and --megatron_config arguments shown in the Command above (including the embedded 20B_P3.yml file).
Summary

Summary metrics are your model's outputs.

No summary metrics saved for this run.
