Skip to main content

Chilli's group workspace

example

What makes this group special?
Tags

floral-firefly-11599

Notes
State
Failed
Start time
April 4th, 2025 6:18:40 AM
Runtime
38s
Tracked hours
37s
Run path
eleutherai/neox/5rat5lum
OS
Linux-6.5.13-65-650-4141-22041-coreweave-amd64-85c45edc-x86_64-with-glibc2.35
Python version
CPython 3.10.14
Git repository
git clone git@github.com:EleutherAI/gpt-neox.git
Git state
git checkout -b "floral-firefly-11599" f84b54e20361772cf97da87ba48960b741954065
Command
/mnt/ssd-1/nora/gpt-neox/train.py --local_rank=0 --deepspeed_config eyJ0cmFpbl9iYXRjaF9zaXplIjogMzIsICJ0cmFpbl9taWNyb19iYXRjaF9zaXplX3Blcl9ncHUiOiA0LCAib3B0aW1pemVyIjogeyJ0eXBlIjogIkFkYW0iLCAicGFyYW1zIjogeyJsciI6IDAuMDAwNiwgImJldGFzIjogWzAuOSwgMC45NV0sICJlcHMiOiAxZS0wOH19LCAiZnAxNiI6IHsiZW5hYmxlZCI6IHRydWUsICJsb3NzX3NjYWxlIjogMCwgImxvc3Nfc2NhbGVfd2luZG93IjogMTAwMCwgImh5c3RlcmVzaXMiOiAyLCAibWluX2xvc3Nfc2NhbGUiOiAxfSwgInplcm9fb3B0aW1pemF0aW9uIjogeyJzdGFnZSI6IDEsICJhbGxnYXRoZXJfcGFydGl0aW9ucyI6IHRydWUsICJhbGxnYXRoZXJfYnVja2V0X3NpemUiOiA1MDAwMDAwMDAsICJvdmVybGFwX2NvbW0iOiB0cnVlLCAicmVkdWNlX3NjYXR0ZXIiOiB0cnVlLCAicmVkdWNlX2J1Y2tldF9zaXplIjogNTAwMDAwMDAwLCAiY29udGlndW91c19ncmFkaWVudHMiOiB0cnVlfSwgIndhbGxfY2xvY2tfYnJlYWtkb3duIjogdHJ1ZX0= --megatron_config {"hostfile": "/mock_path", "train_batch_size": 32, "train_micro_batch_size_per_gpu": 4, "optimizer": {"type": "Adam", "params": {"lr": 0.0006, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true}, "wall_clock_breakdown": true, "precision": "fp16", "num_layers": 12, "hidden_size": 768, "num_attention_heads": 12, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "init_method": "small_init", "output_layer_init_method": "wang_init", "lr_decay_style": "cosine", "lr_decay_iters": 320000, "min_lr": 6e-05, "optimizer_type": "Adam", "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.0006, "data_path": 
"/mnt/ssd-1/data/enwik8/enwik8_text_document", "data_impl": "mmap", "save": "/mnt/ssd-1/checkpoints", "config_files": {"125M-moe.yml": "# GPT-2 pretraining setup\n{\n   # See README for MoE config docs!\n   \"moe_type\": \"deepspeed\",\n   \"moe_token_dropping\": true,\n   # Have 4 experts per layer (every 2 layers by default)\n   \"moe_num_experts\": 4,\n   # parallelism settings\n   \"enable_expert_tensor_parallelism\": true,\n   \"pipe_parallel_size\": 1, # not yet supported for MoE\n   \"model_parallel_size\": 1,\n   \"moe_expert_parallel_size\": 1,\n\n   # model settings\n   \"num_layers\": 12,\n   \"hidden_size\": 768,\n   \"num_attention_heads\": 12,\n   \"seq_length\": 2048,\n   \"max_position_embeddings\": 2048,\n   \"norm\": \"layernorm\",\n   \"pos_emb\": \"rotary\",\n   \"no_weight_tying\": true,\n   \"gpt_j_residual\": false,\n   \"output_layer_parallelism\": \"column\",\n\n   # these should provide some speedup but takes a while to build, set to true if desired\n   \"scaled_upper_triang_masked_softmax_fusion\": false,\n   \"bias_gelu_fusion\": false,\n   \"rope_fusion\": false,\n\n   # init methods\n   \"init_method\": \"small_init\",\n   \"output_layer_init_method\": \"wang_init\",\n\n\n   # optimizer settings\n   \"optimizer\": {\n     \"type\": \"Adam\",\n     \"params\": {\n       \"lr\": 0.0006,\n       \"betas\": [0.9, 0.95],\n       \"eps\": 1.0e-8,\n     }\n   },\n   \"min_lr\": 0.00006,\n\n   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n   \"zero_optimization\": {\n    \"stage\": 1,\n    \"allgather_partitions\": True,\n    \"allgather_bucket_size\": 500000000,\n    \"overlap_comm\": True,\n    \"reduce_scatter\": True,\n    \"reduce_bucket_size\": 500000000,\n    \"contiguous_gradients\": True,\n  },\n\n   # batch / data settings\n   \"train_micro_batch_size_per_gpu\": 4,\n   \"data_impl\": \"mmap\",\n\n   # activation checkpointing\n   \"checkpoint_activations\": 
true,\n   \"checkpoint_num_layers\": 1,\n   \"partition_activations\": true,\n   \"synchronize_each_layer\": true,\n\n   # regularization\n   \"gradient_clipping\": 1.0,\n   \"weight_decay\": 0.1,\n   \"hidden_dropout\": 0.0,\n   \"attention_dropout\": 0.0,\n\n   # precision settings\n   \"fp16\": {\n     \"enabled\": true,\n     \"loss_scale\": 0,\n     \"loss_scale_window\": 1000,\n     \"hysteresis\": 2,\n     \"min_loss_scale\": 1\n   },\n\n   # misc. training settings\n   \"train_iters\": 320000,\n   \"lr_decay_iters\": 320000,\n   \"distributed_backend\": \"nccl\",\n   \"lr_decay_style\": \"cosine\",\n   \"warmup\": 0.01,\n   \"checkpoint_factor\": 10000,\n   \"eval_interval\": 1000,\n   \"eval_iters\": 10,\n\n   # logging\n   \"log_interval\": 10,\n   \"steps_per_print\": 10,\n   \"keep_last_n_checkpoints\": 4,\n   \"wall_clock_breakdown\": true,\n\n  #  networking\n  \"hostfile\": \"/mock_path\"\n}\n", "eleutherai_cluster.yml": "# Data paths and options when using EleutherAI cluster\n{\n  # you may include multiple distinct datasets if desired\n  #\"train_data_paths\": [\"/mnt/ssd-1/data/enwik8/enwik8_text_document\"],\n  #\"valid_data_paths\": [\"/mnt/ssd-1/data/enwik8/enwik8_val_text_document\"],\n  #\"test_data_paths\": [\"/mnt/ssd-1/data/enwik8/enwik8_test_text_document\"],\n\n  # if using multiple datasets, provide weights for them to be sampled with\n  # \"train-data-weights\": [1., 2.],\n  # \"test-data-weights\": [2., 1.],\n  # \"valid-data-weights\": [0.5, 0.4],\n\n\n  # If you would like the code to create val and test datasets from your training set use the following instead\n  # \"split\" determines the relative size of train, val, and test\n\n  \"split\": \"995,4,1\",\n  \"data_path\": \"/mnt/ssd-1/data/enwik8/enwik8_text_document\",\n\n  \"vocab_file\": \"/mnt/ssd-1/data/gpt2-vocab.json\",\n  \"merge_file\": \"/mnt/ssd-1/data/gpt2-merges.txt\",\n  \"save\": \"/mnt/ssd-1/checkpoints\",\n  \"load\": \"/mnt/ssd-1/checkpoints\",\n  
\"tensorboard_dir\": \"/mnt/ssd-1/tensorboard\",\n  \"log_dir\": \"/mnt/ssd-1/logs\",\n  \"wandb_team\": \"eleutherai\",\n  #\"wandb_run_name\": \"experiment\"\n  \"wandb_project\": \"neox\",\n  \"wandb_group\": \"example\"\n}\n"}, "load": "/mnt/ssd-1/checkpoints", "checkpoint_factor": 10000, "batch_size": 4, "train_iters": 320000, "eval_iters": 10, "keep_last_n_checkpoints": 4, "split": "995,4,1", "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json", "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt", "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "dynamic_loss_scale": true, "pipe_parallel_size": 1, "world_size": 1, "wandb_group": "example", "wandb_team": "eleutherai", "log_dir": "/mnt/ssd-1/logs", "tensorboard_dir": "/mnt/ssd-1/tensorboard", "log_interval": 10, "text_gen_type": "unconditional", "moe_num_experts": 4, "moe_token_dropping": true, "moe_type": "deepspeed", "enable_expert_tensor_parallelism": true, "local_rank": 0, "rank": 0, "user_script": "train.py", "global_num_gpus": 8}
System Hardware
CPU count 48
Logical CPU count 96
GPU count 8
GPU type NVIDIA A40
W&B CLI Version
0.19.8
Group
example
Config

Config parameters are your model's inputs. Learn more

  • {} 327 keys
    • null
    • "gelu"
    • null
    • false
    • 1,000
    • true
    • null
    • false
    • [] 12 items
      • 0
      • false
      • null
      • null
      • null
      • 4
      • null
      • false
      • false
      • false
      • null
      • true
      • 10,000
      • false
      • 1
      • "linear"
      • false
      • 1
      • null
      • null
      • null
      • null
      • null
      • null
      • null
      • null
      • null
      • null
      • {} 2 keys
        • "# GPT-2 pretraining setup { # See README for MoE config docs! "moe_type": "deepspeed", "moe_token_dropping": true, # Have 4 experts per layer (every 2 layers by default) "moe_num_experts": 4, # parallelism settings "enable_expert_tensor_parallelism": true, "pipe_parallel_size": 1, # not yet supported for MoE "model_parallel_size": 1, "moe_expert_parallel_size": 1, # model settings "num_layers": 12, "hidden_size": 768, "num_attention_heads": 12, "seq_length": 2048, "max_position_embeddings": 2048, "norm": "layernorm", "pos_emb": "rotary", "no_weight_tying": true, "gpt_j_residual": false, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 0.0006, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 0.00006, # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 500000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 500000000, "contiguous_gradients": True, }, # batch / data settings "train_micro_batch_size_per_gpu": 4, "data_impl": "mmap", # activation checkpointing "checkpoint_activations": true, "checkpoint_num_layers": 1, "partition_activations": true, "synchronize_each_layer": true, # regularization "gradient_clipping": 1.0, "weight_decay": 0.1, "hidden_dropout": 0.0, "attention_dropout": 0.0, # precision settings "fp16": { "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1 }, # misc. 
training settings "train_iters": 320000, "lr_decay_iters": 320000, "distributed_backend": "nccl", "lr_decay_style": "cosine", "warmup": 0.01, "checkpoint_factor": 10000, "eval_interval": 1000, "eval_iters": 10, # logging "log_interval": 10, "steps_per_print": 10, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, # networking "hostfile": "/mock_path" } "
        • "# Data paths and options when using EleutherAI cluster { # you may include multiple distinct datasets if desired #"train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"], #"valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"], #"test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"], # if using multiple datasets, provide weights for them to be sampled with # "train-data-weights": [1., 2.], # "test-data-weights": [2., 1.], # "valid-data-weights": [0.5, 0.4], # If you would like the code to create val and test datasets from your training set use the following instead # "split" determines the relative size of train, val, and test "split": "995,4,1", "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document", "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json", "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt", "save": "/mnt/ssd-1/checkpoints", "load": "/mnt/ssd-1/checkpoints", "tensorboard_dir": "/mnt/ssd-1/tensorboard", "log_dir": "/mnt/ssd-1/logs", "wandb_team": "eleutherai", #"wandb_run_name": "experiment" "wandb_project": "neox", "wandb_group": "example" } "
      • false
      • false
      • true
      • null
      • null
      • 0
      • null
      • "mmap"
      • 46 ... 95
        96 ... 145
        146 ... 195
        196 ... 245
        246 ... 295
        296 ... 322
      • {} 7 keys
        • 500,000,000
        • true
        • 1
      Summary

      Summary metrics are your model's outputs. Learn more

      • {} 0 keys