Quentin-anthony's workspace
Runs: 1 (1 visualized)

State: Crashed
Notes: -
User: quentin-anthony
Runtime: 2m
Sweep: -
Run config:

activation: gelu
adlr_autoresume: false
adlr_autoresume_interval: 1000
allow_chopped: true
apply_query_key_layer_scaling: false
attention_config: ["global","global","global","global","global","global","global","global","global","global","global","global"]
attention_dropout: 0
attention_softmax_in_fp32: false
batch_size: 4
bias_dropout_fusion: false
bias_gelu_fusion: false
char_level_ppl: false
checkpoint_activations: true
checkpoint_factor: 10000
checkpoint_in_cpu: false
checkpoint_num_layers: 1
checkpoint_scale: linear
checkpoint_validation_with_forward_pass: false
clip_grad: 1
config_files.125M.yml: (full file below)
config_files.local_setup_wandb.yml: (full file below)
contiguous_checkpointing: false
coord_check: false
create_moe_param_group: true
curriculum_seqlen: 0
data_impl: mmap
data_path: data/enwik8/enwik8_text_document
dataset_impl: gpt2
deepscale: false
deepspeed: true
deepspeed_activation_checkpointing: true
deepspeed_mpi: false
deepspeed_slurm: false
detect_nvlink_pairs: false
distributed_backend: nccl
dpo_beta: 0.1
dpo_fp32: true
dpo_reference_free: false
dump_state: false
dynamic_loss_scale: true
enable_expert_tensor_parallelism: false
eod_mask_loss: false
eval_interval: 1000
eval_iters: 10
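The key/value pairs above are the run's logged configuration. As a rough illustration of how a flat config like this reaches W&B, here is a minimal sketch using the standard wandb Python client; the project and run names are the ones set in config_files.local_setup_wandb.yml below, the config dict is abbreviated to a few of the keys shown above, and GPT-NeoX does something equivalent internally when use_wandb is enabled.

import wandb

# A few entries from the run-config table above; the real dict carries every key.
config = {
    "activation": "gelu",
    "batch_size": 4,
    "checkpoint_activations": True,
    "clip_grad": 1,
    "distributed_backend": "nccl",
    "eval_interval": 1000,
    "eval_iters": 10,
}

run = wandb.init(
    project="pr_test",       # wandb_project in local_setup_wandb.yml
    name="experiment_test",  # wandb_run_name in local_setup_wandb.yml
    config=config,           # rendered as the key/value table above
)
run.finish()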
config_files.125M.yml:

# GPT-2 pretraining setup
{
  # parallelism settings (you will want to change these based on your cluster setup,
  # ideally scheduling pipeline stages across the node boundaries)
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,

  # model settings
  "num_layers": 12,
  "hidden_size": 768,
  "num_attention_heads": 12,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
  "norm": "layernorm",
  "pos_emb": "rotary",
  "no_weight_tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",

  # these should provide some speedup but take a while to build; set to true if desired
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0006,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8,
    }
  },
  "min_lr": 0.00006,

  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 500000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": true,
  },

  # batch / data settings
  "train_micro_batch_size_per_gpu": 4,
  "data_impl": "mmap",

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.1,
  "hidden_dropout": 0.0,
  "attention_dropout": 0.0,

  # precision settings
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },

  # misc. training settings
  "train_iters": 320000,
  "lr_decay_iters": 320000,
  "distributed_backend": "nccl",
  "lr_decay_style": "cosine",
  "warmup": 0.01,
  "checkpoint_factor": 10000,
  "eval_interval": 1000,
  "eval_iters": 10,
  # "train_epochs": 3,

  # logging
  "log_interval": 1,
  "steps_per_print": 1,
  "keep_last_n_checkpoints": 4,
  "wall_clock_breakdown": true,

  # networking
  "hostfile": "/mock_path"
}
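GPT-NeoX-style config files like the one above are YAML with "#" comments, and a run is normally launched with several of them at once (a model config plus a local setup file). Below is a minimal sketch of merging them into a single settings dict, assuming PyYAML, hypothetical file locations under configs/, and a plain last-writer-wins merge; the project's real loader also validates and normalizes keys, so treat this as an illustration only.

import yaml

def load_configs(*paths):
    # Merge several YAML config files; later files win on duplicate keys.
    merged = {}
    for path in paths:
        with open(path) as f:
            merged.update(yaml.safe_load(f))
    return merged

settings = load_configs("configs/125M.yml", "configs/local_setup_wandb.yml")
print(settings["hidden_size"], settings["wandb_project"])  # 768 pr_test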
config_files.local_setup_wandb.yml:

# Suggested data paths when using GPT-NeoX locally
{
  "data_path": "data/enwik8/enwik8_text_document",

  # or for weighted datasets:
  # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
  # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
  # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
  # "train-data-weights": [1., 2.],
  # "test-data-weights": [2., 1.],
  # "valid-data-weights": [0.5, 0.4],

  # If weight_by_num_documents is true, builds dataset weights from a multinomial
  # distribution over groups of data, according to the number of documents in each group.
  # WARNING: setting this to true will override any user-provided weights
  # "weight_by_num_documents": false,
  # "weighted_sampler_alpha": 0.3,

  "vocab_file": "data/gpt2-vocab.json",
  "merge_file": "data/gpt2-merges.txt",

  "save": "checkpoints",
  "load": "checkpoints",
  "checkpoint_validation_with_forward_pass": false,

  "tensorboard_dir": "tensorboard",
  "log_dir": "logs",
  "use_wandb": true,
  "wandb_host": "https://api.wandb.ai",
  "wandb_project": "pr_test",
  "wandb_run_name": "experiment_test",
  "peak_theoretical_tflops": 100,
}
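The commented weighted-dataset block above refers to weight_by_num_documents and weighted_sampler_alpha. The idea, sketched below with an illustrative helper rather than GPT-NeoX's actual code: each data group's sampling probability is its share of the total document count, raised to the power alpha and renormalized, so alpha = 1 keeps proportional sampling while alpha < 1 upweights smaller groups.

def weights_by_num_documents(doc_counts, alpha=0.3):
    # Proportional share of documents per group...
    total_docs = sum(doc_counts)
    unbiased = [c / total_docs for c in doc_counts]
    # ...smoothed by alpha and renormalized into sampling weights.
    smoothed = [p ** alpha for p in unbiased]
    norm = sum(smoothed)
    return [s / norm for s in smoothed]

# Two groups with 9000 and 1000 documents: proportional sampling gives
# [0.9, 0.1]; alpha = 0.3 softens this to roughly [0.66, 0.34].
print(weights_by_num_documents([9000, 1000]))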