eleutherai

Finished

stellaathena

4y ago

1d 1m 11s

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

-

# GPT-2 pretraining setup { # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 4, "model-parallel-size": 2, # model settings "num-layers": 44, "hidden-size": 6144, "num-attention-heads": 64, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "rotary_pct": 0.25, "no-weight-tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": false, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 9.7e-8, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 9.7e-8, "override-lr-scheduler": True, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 1260000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 1260000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings (assuming 32 GPUs) "train_micro_batch_size_per_gpu": 4, "gradient_accumulation_steps": 16, "data-impl": "mmap", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": false, "synchronize-each-layer": true, # regularization "gradient_clipping": 1.0, "weight-decay": 0.01, "hidden-dropout": 0, "attention-dropout": 0, # precision settings "fp16": { "fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 200000, "distributed-backend": "nccl", "lr-decay-style": "constant", "save-interval": 500, "eval-interval": 100, "eval-iters": 1, # "eval_tasks": ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"], # logging "log-interval": 2, "steps_per_print": 2, "wall_clock_breakdown": false, ### NEW DATA: #### "tokenizer_type": "HFTokenizer", "vocab-file": "/mnt/ssd-1/data/20B_tokenizer.json", "train_data_paths": ["/mnt/ssd-1/P3/P3_combined/train_text_document"], "valid_data_paths": ["/mnt/ssd-1/P3/P3_combined/validation_text_document"], "test_data_paths": ["/mnt/ssd-1/P3/P3_combined/test_text_document"], "train_data_weights": [1.0], "valid_data_weights": [1.0], "test_data_weights": [1.0], "save": "/mnt/ssd-1/20B_P3", "load": "/mnt/ssd-1/20B_P3", "keep_last_n_checkpoints": 3, "tensorboard-dir": "/mnt/ssd-1/tensorboard", "log-dir": "/mnt/ssd-1/logs", "wandb_team": "eleutherai", "wandb_project": "gpt-thicc", "wandb_group": "P3" }

-

false

-

mmap

-

false

true

false

-

false

-

nccl

false

Finished

stellaathena

4y ago

11h 19m 19s

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

-

# GPT-2 pretraining setup { # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 4, "model-parallel-size": 2, # model settings "num-layers": 44, "hidden-size": 6144, "num-attention-heads": 64, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "rotary_pct": 0.25, "no-weight-tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": false, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 9.7e-8, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 9.7e-8, "override-lr-scheduler": True, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 1260000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 1260000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings (assuming 32 GPUs) "train_micro_batch_size_per_gpu": 4, "gradient_accumulation_steps": 16, "data-impl": "mmap", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": false, "synchronize-each-layer": true, # regularization "gradient_clipping": 1.0, "weight-decay": 0.01, "hidden-dropout": 0, "attention-dropout": 0, # precision settings "fp16": { "fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 200000, "distributed-backend": "nccl", "lr-decay-style": "constant", "save-interval": 500, "eval-interval": 100, "eval-iters": 1, # "eval_tasks": ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"], # logging "log-interval": 2, "steps_per_print": 2, "wall_clock_breakdown": false, ### NEW DATA: #### "tokenizer_type": "HFTokenizer", "vocab-file": "/mnt/ssd-1/data/20B_tokenizer.json", "train_data_paths": ["/mnt/ssd-1/P3/P3_combined/train_text_document"], "valid_data_paths": ["/mnt/ssd-1/P3/P3_combined/validation_text_document"], "test_data_paths": ["/mnt/ssd-1/P3/P3_combined/test_text_document"], "train_data_weights": [1.0], "valid_data_weights": [1.0], "test_data_weights": [1.0], "save": "/mnt/ssd-1/20B_P3", "load": "/mnt/ssd-1/20B_P3", "keep_last_n_checkpoints": 3, "tensorboard-dir": "/mnt/ssd-1/tensorboard", "log-dir": "/mnt/ssd-1/logs", "wandb_team": "eleutherai", "wandb_project": "gpt-thicc", "wandb_group": "P3" }

-

false

-

mmap

-

false

true

false

-

false

-

nccl

false

Finished

stellaathena

4y ago

1d 4h 35m 59s

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

-

# GPT-2 pretraining setup { # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 4, "model-parallel-size": 2, # model settings "num-layers": 44, "hidden-size": 6144, "num-attention-heads": 64, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "rotary_pct": 0.25, "no-weight-tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": false, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 9.7e-8, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 9.7e-8, "override-lr-scheduler": True, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 1260000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 1260000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings (assuming 32 GPUs) "train_micro_batch_size_per_gpu": 4, "gradient_accumulation_steps": 16, "data-impl": "mmap", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": false, "synchronize-each-layer": true, # regularization "gradient_clipping": 1.0, "weight-decay": 0.01, "hidden-dropout": 0, "attention-dropout": 0, # precision settings "fp16": { "fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 200000, "distributed-backend": "nccl", "lr-decay-style": "constant", "save-interval": 1000, "eval-interval": 10000, "eval-iters": 10, # "eval_tasks": ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"], # logging "log-interval": 2, "steps_per_print": 2, "wall_clock_breakdown": false, ### NEW DATA: #### "tokenizer_type": "HFTokenizer", "vocab-file": "/mnt/ssd-1/data/20B_tokenizer.json", "train_data_paths": ["/mnt/ssd-1/P3/P3_combined/train_text_document"], "valid_data_paths": ["/mnt/ssd-1/P3/P3_combined/validation_text_document"], "test_data_paths": ["/mnt/ssd-1/P3/P3_combined/test_text_document"], "train_data_weights": [1.0], "valid_data_weights": [1.0], "test_data_weights": [1.0], "save": "/mnt/ssd-1/20B_P3", "load": "/mnt/ssd-1/20B_checkpoints", "keep_last_n_checkpoints": 3, "tensorboard-dir": "/mnt/ssd-1/tensorboard", "log-dir": "/mnt/ssd-1/logs", "wandb_team": "eleutherai", "wandb_project": "gpt-thicc", "wandb_group": "P3" }

-

false

-

mmap

-

false

true

false

-

false

-

nccl

false

Finished

jmerizia-vt-edu

4y ago

2h 25m 41s

-

gelu

false

1000

false

-

global

0

false

2

-

false

true

false

-

true

-

false

1

-

false

1

-

# GPT-2 pretraining setup { # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 4, "model-parallel-size": 2, # model settings "num-layers": 44, "hidden-size": 6144, "num-attention-heads": 64, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "rotary_pct": 0.25, "no-weight-tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": false, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 9.7e-5, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 9.7e-5, "use-checkpoint-lr-scheduler": True, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 1260000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 1260000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings (assuming 32 GPUs) "train_micro_batch_size_per_gpu": 2, "gradient_accumulation_steps": 16, "data-impl": "mmap", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": false, "synchronize-each-layer": true, # regularization "gradient_clipping": 1.0, "weight-decay": 0.05, "hidden-dropout": 0, "attention-dropout": 0, # precision settings "fp16": { "fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 200000, "distributed-backend": "nccl", "lr-decay-style": "constant", "save-interval": 500, "eval-interval": 50, "eval-iters": 10, "keep_last_n_checkpoints": 2, # logging "log-interval": 2, "steps_per_print": 2, "wall_clock_breakdown": false, ### NEW DATA: #### "tokenizer_type": "HFTokenizer", "vocab-file": "/mnt/ssd-1/data/20B_tokenizer.json", "train_data_paths": ["/mnt/ssd-1/data/APPS/APPS-TRAIN-QAAA_text_document"], "valid_data_paths": ["/mnt/ssd-1/data/APPS/APPS-TEST-QAAA_text_document"], "test_data_paths": ["/mnt/ssd-1/data/APPS/APPS-TEST-QAAA_text_document"], "train_data_weights": [1.0], "valid_data_weights": [1.0], "test_data_weights": [1.0], "save": "/mnt/ssd-1/20B_APPS", "load": "/mnt/ssd-1/20B_APPS", "tensorboard-dir": "/mnt/ssd-1/tensorboard", "log-dir": "/mnt/ssd-1/logs", "wandb_team": "eleutherai", "wandb_project": "gpt-thicc", "wandb_group": "codetune", }

-

false

-

mmap

-

false

true

false

-

false

-

nccl

false

Finished

stellaathena

4y ago

2d 4h 52m 15s

-

gelu

false

1000

false

-

global

0

false

2

-

false

true

false

-

true

-

false

1

-

false

1

-

# GPT-2 pretraining setup { # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 4, "model-parallel-size": 2, # model settings "num-layers": 44, "hidden-size": 6144, "num-attention-heads": 64, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "rotary_pct": 0.25, "no-weight-tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": false, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 9.7e-5, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 9.7e-5, "use-checkpoint-lr-scheduler": True, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 1260000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 1260000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings (assuming 32 GPUs) "train_micro_batch_size_per_gpu": 2, "gradient_accumulation_steps": 16, "data-impl": "mmap", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": false, "synchronize-each-layer": true, # regularization "gradient_clipping": 1.0, "weight-decay": 0.05, "hidden-dropout": 0, "attention-dropout": 0, # precision settings "fp16": { "fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 200000, "distributed-backend": "nccl", "lr-decay-style": "constant", "save-interval": 1000, "eval-interval": 12000, "eval-iters": 10, # "eval_tasks": ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"], # logging "log-interval": 2, "steps_per_print": 2, "wall_clock_breakdown": false, ### NEW DATA: #### "tokenizer_type": "HFTokenizer", "vocab-file": "/mnt/ssd-1/data/20B_tokenizer.json", "train_data_paths": ["/mnt/ssd-1/data/MATH/MATH_train_text_document"], "valid_data_paths": ["/mnt/ssd-1/data/MATH/MATH_test_text_document"], "test_data_paths": ["/mnt/ssd-1/data/MATH/MATH_test_text_document"], "train_data_weights": [1.0], "valid_data_weights": [1.0], "test_data_weights": [1.0], "save": "/mnt/ssd-1/20B_math", "load": "/mnt/ssd-1/20B_math", "tensorboard-dir": "/mnt/ssd-1/tensorboard", "log-dir": "/mnt/ssd-1/logs", "wandb_team": "eleutherai", "wandb_project": "gpt-thicc", "wandb_group": "math" }

-

false

-

mmap

-

false

true

false

-

false

-

nccl

false

Crashed

sdtblck

4y ago

18mo 29d 20h 15m 9s

-

gelu

false

1000

false

-

global

0

false

1.42857

-

false

true

false

-

[false,true]

500

false

1

linear

false

1

-

# GPT-2 pretraining setup { # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 4, "model-parallel-size": 2, # model settings "num-layers": 44, "hidden-size": 6144, "num-attention-heads": 64, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "rotary_pct": 0.25, "no-weight-tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": true, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 0.97e-4, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 0.97e-5, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 1260000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 1260000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings (assuming 96 GPUs) "train_micro_batch_size_per_gpu": 4, "gradient_accumulation_steps": 32, "data-impl": "mmap", "split": "995,4,1", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": false, "synchronize-each-layer": true, # regularization "gradient_clipping": 1.0, "weight-decay": 0.01, "hidden-dropout": 0, "attention-dropout": 0, # precision settings "fp16": { "fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 150, "lr-decay-iters": 150, "distributed-backend": "nccl", "lr-decay-style": "cosine", "warmup": 0.01, "checkpoint_factor": 500, "eval-interval": 1000, "eval-iters": 10, # "eval_tasks": ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"], # logging "log-interval": 2, "steps_per_print": 2, "wall_clock_breakdown": false, ### NEW DATA: #### "tokenizer_type": "HFTokenizer", "vocab-file": "/fsx/quentin/jacob/gpt-neox-stuff/gpt-neox/data/test20B_tokenizer.json", #"data-path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document", "data-path": "/fsx/quentin/jacob/gpt-neox-stuff/gpt-neox/data/enwik8/enwik8_text_document", "save": "20B_checkpoints", "load": "20B_checkpoints", "tensorboard-dir": "tensorboard", "log-dir": "logs", "wandb_team": "eleutherai", "wandb_project": "gpt-thicc", }

false

0

-

mmap

["/fsx/quentin/jacob/gpt-neox-stuff/gpt-neox/data/enwik8/enwik8_text_document","/mnt/ssd-1/data/pile_filtered_tokenized/pile_filtered_text_document"]

false

true

false

-

nccl

false

Failed

stellaathena

4y ago

30s

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

# GPT-2 pretraining setup { # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 4, "model-parallel-size": 2, # model settings "num-layers": 44, "hidden-size": 6144, "num-attention-heads": 64, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "rotary_pct": 0.25, "no-weight-tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": false, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 0.97e-4, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 0.97e-5, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 1260000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 1260000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings (assuming 96 GPUs) "train_micro_batch_size_per_gpu": 4, "gradient_accumulation_steps": 32, "data-impl": "mmap", "split": "995,4,1", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": false, "synchronize-each-layer": true, # regularization "gradient_clipping": 1.0, "weight-decay": 0.01, "hidden-dropout": 0, "attention-dropout": 0, # precision settings "fp16": { "fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 150000, "lr-decay-iters": 150000, "distributed-backend": "nccl", "lr-decay-style": "cosine", "warmup": 0.01, "save-interval": 500, "eval-interval": 1000, "eval-iters": 10, # "eval_tasks": ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"], # logging "log-interval": 2, "steps_per_print": 2, "wall_clock_breakdown": false, ### NEW DATA: #### "tokenizer_type": "HFTokenizer", "vocab-file": "/mnt/ssd-1/data/20B_tokenizer.json", "data-path": "/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document", "save": "/mnt/ssd-cluster/20B_checkpoints", "load": "/mnt/ssd-cluster/20B_checkpoints", "tensorboard-dir": "/mnt/ssd-cluster/tensorboard", "log-dir": "/mnt/ssd-cluster/logs", "wandb_team": "eleutherai", "wandb_project": "gpt-thicc", "wandb_group": "A40-test" }

-

false

-

mmap

/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document

false

true

false

-

false

-

nccl

false

Finished

stellaathena

4y ago

4d 8h 17m 37s

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

-

# GPT-2 pretraining setup { # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages # across the node boundaries ) "pipe-parallel-size": 4, "model-parallel-size": 2, "override_lr_scheduler": true, # model settings "num-layers": 44, "hidden-size": 6144, "num-attention-heads": 64, "seq-length": 2048, "max-position-embeddings": 2048, "norm": "layernorm", "pos-emb": "rotary", "rotary_pct": 0.25, "no-weight-tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", # these should provide some speedup but takes a while to build, set to true if desired "scaled-upper-triang-masked-softmax-fusion": true, "bias-gelu-fusion": true, # init methods "init_method": "small_init", "output_layer_init_method": "wang_init", # optimizer settings "optimizer": { "type": "Adam", "params": { "lr": 0.97e-4, "betas": [0.9, 0.95], "eps": 1.0e-8, } }, "min_lr": 1.05e-5, "zero_optimization": { "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 1260000000, "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 1260000000, "contiguous_gradients": True, "cpu_offload": False }, # batch / data settings (assuming 96 GPUs) "train_micro_batch_size_per_gpu": 4, "gradient_accumulation_steps": 32, "data-impl": "mmap", "split": "995,4,1", # activation checkpointing "checkpoint-activations": true, "checkpoint-num-layers": 1, "partition-activations": false, "synchronize-each-layer": true, # regularization "gradient_clipping": 1.0, "weight-decay": 0.1, "hidden-dropout": 0, "attention-dropout": 0, # precision settings "fp16": { "fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1 }, # misc. training settings "train-iters": 150000, "lr-decay-iters": 150000, "distributed-backend": "nccl", "lr-decay-style": "cosine", "warmup": 0.01, "save-interval": 500, "eval-interval": 1000, "eval-iters": 10, # "eval_tasks": ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"], # logging "log-interval": 2, "steps_per_print": 2, "wall_clock_breakdown": false, ### NEW DATA: #### "tokenizer_type": "HFTokenizer", "vocab-file": "/mnt/ssd-1/data/20B_tokenizer.json", "data-path": "/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document", "save": "/mnt/ssd-1/20B_fork_checkpoints", "load": "/mnt/ssd-1/20B_checkpoints", "tensorboard-dir": "/mnt/ssd-1/tensorboard", "log-dir": "/mnt/ssd-1/logs", "wandb_team": "eleutherai", "wandb_project": "gpt-thicc", }

-

false

-

mmap

/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document

false

true

false

-

false

-

nccl

false

Crashed

stellaathena

4y ago

2d 20h 59m

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

-

false

-

mmap

-

false

true

false

-

false

-

nccl

false

Crashed

stellaathena

4y ago

2d 4h 42m 14s

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

-

false

-

mmap

-

false

true

false

-

false

-

nccl

false

Crashed

sdtblck

4y ago

1mo 19d 5h 28m 57s

-

0

-

true

-

true

1

-

mmap

/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document

-

nccl

-

Crashed

sdtblck

4y ago

2mo 21d 16h 57m 14s

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

-

false

-

mmap

/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document

false

true

false

-

false

-

nccl

false

Finished

quentin-anthony

4y ago

1h 7s

-

gelu

false

1000

false

-

global

0

false

8

-

false

true

false

-

false

-

false

1

-

false

1

-

false

-

mmap

/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document

false

true

false

-

false

-

nccl

false

Finished

quentin-anthony

4y ago

59m 46s

-

gelu

false

1000

false

-

global

0

false

8

-

false

true

false

-

false

-

false

1

-

false

1

-

false

-

mmap

/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document

false

true

false

-

false

-

nccl

false

Crashed

sdtblck

4y ago

17d 4h 29m 10s

-

gelu

false

1000

false

0

global

0

false

8

true

false

true

false

true

1

false

-

false

1

-

false

1

-

false

-

mmap

/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document

mmap

/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document

false

true

false

-

false

nccl

false

Crashed

quentin-anthony

4y ago

13h 15m 31s

-

gelu

false

1000

false

-

global

0

false

8

-

false

true

false

-

false

-

false

1

-

false

1

-

false

-

mmap

/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document

false

true

false

-

false

-

nccl

false

Finished

sdtblck

4y ago

20s

-

Finished

sdtblck

4y ago

8m 45s

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

-

false

-

mmap

/mnt/ssd-1/data/pile_filtered_tokenized/pile_filtered_text_document

false

true

false

-

false

-

nccl

false

Failed

sdtblck

4y ago

17s

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

-

false

-

mmap

/mnt/ssd-1/data/pile_filtered_tokenized/pile_filtered_text_document

false

true

false

-

false

-

nccl

false

Failed

sdtblck

4y ago

43s

-

gelu

false

1000

false

-

global

0

false

4

-

false

true

false

-

true

-

false

1

-

false

1

-

false

-

mmap

/mnt/ssd-1/data/pile_filtered_tokenized/pile_filtered_text_document

false

true

false

-

false

-

nccl

false

Aran's workspace