eleutherai-oslo's workspace
Runs: 480 in the workspace; the current view contains 3 runs, tabulated below.
Run 1: state Crashed, user eleutherai-oslo, runtime 12d 20h 4m 2s
Run 2: state Crashed, user eleutherai-oslo, runtime 10d 14h 2m 38s
Run 3: state Crashed, user eleutherai-oslo, runtime 4d 19h 46m 23s

Logged columns (a single value means all three runs share it; where the runs differ, values are given as Run 1 | Run 2 | Run 3; the Notes, Tags and Created columns carried no values in the export):

Sweep: -
activation: gelu
adlr_autoresume: false
adlr_autoresume_interval: 1000
apply_query_key_layer_scaling: false
attention_config: global
attention_dropout: 0
attention_softmax_in_fp32: true
batch_size: 4 | 8 | 4
bias_dropout_fusion: false
bias_gelu_fusion: false | true | true
char_level_ppl: false
checkpoint_activations: true
checkpoint_in_cpu: false
checkpoint_num_layers: 1
checkpoint_validation_with_forward_pass: false
clip_grad: 1
config_files.1B_ko.yml: - | - | three logged versions of 1B_ko.yml (reproduced below)
contiguous_checkpointing: false
data_impl: mmap
data_path:
  Run 1: /fsx/multi-lingual-6b/gpt-neox/processed/multi_ko_6b_text_document
  Run 2: /fsx/multi-lingual-6b/gpt-neox/processed/multi_ko_3b_text_document
  Run 3: /fsx/multi-lingual-6b/gpt-neox/processed/multi_ko_1b_text_document
deepscale: false
deepspeed: true
deepspeed_activation_checkpointing: true
deepspeed_mpi: true
detect_nvlink_pairs: false
distributed_backend: nccl
dump_state: false
dynamic_loss_scale: true
eod_mask_loss: false
eval_interval: 1000
eval_iters: 10
eval_results_prefix: (no value in the export)
eval_tasks: nsmc
eval_tasks_interval: 50000000000
finetune: false
fp16.enabled: true
fp16.fp16: true
fp16.hysteresis: 2
fp16.initial_scale_power: 32
fp16.loss_scale: - | - | 0
fp16.loss_scale_window: 1000
fp16.min_loss_scale: 1
fp16_lm_cross_entropy: false
fp32_allreduce: false

config_files.1B_ko.yml (Run 3): the cell holds three versions of the config file. The first version is reproduced in full below. As far as the logged text shows, the second version differs only in "save-interval" (5000 instead of 500), and the third keeps "save-interval": 5000 and additionally has the "load" line commented out, pointing at /fsx/multi-lingual-6b/gpt-neox/checkpoints/6B_8.

# Same as GPT-Neo 1.3B
{
  # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in
  "tokenizer-type": "HFTokenizer",
  "vocab-file": "./tokenizer/MBBPE/tokenizer.json",
  "save": "/fsx/multi-lingual-6b/gpt-neox/checkpoints/1B_scratch",
  "load": "/fsx/multi-lingual-6b/gpt-neox/checkpoints/1B_scratch",

  # wandb config
  "wandb_team": "eleutherai-oslo",

  # If finetuning, edit the following to the location of your finetuning dataset:
  "data-path": "/fsx/multi-lingual-6b/gpt-neox/processed/multi_ko_1b_text_document",
  # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
  # across the node boundaries )
  "is_pipe_parallel": false,
  "pipe-parallel-size": 1,
  "model-parallel-size": 1,

  # model settings
  "num-layers": 24,
  "hidden-size": 2048,
  "num-attention-heads": 16,
  "seq-length": 2048,
  "max-position-embeddings": 2048,
  "norm": "layernorm",
  "pos-emb": "rotary",
  "no-weight-tying": true,
  "rotary_ndims": 64,
  "gpt_j_residual": true,
  "output_layer_parallelism": "column",

  # these should provide some speedup but takes a while to build, set to true if desired
  "scaled-upper-triang-masked-softmax-fusion": false,
  "bias-gelu-fusion": true,

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 2.0e-4,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8,
    }
  },
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": True,
    "allgather_bucket_size": 500000000,
    "overlap_comm": True,
    "reduce_scatter": True,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": True,
    "cpu_offload": False
  },

  # batch / data settings
  "train_micro_batch_size_per_gpu": 4,
  "gradient_accumulation_steps": 1,
  "data-impl": "mmap",
  "split": "949,50,1",

  # activation checkpointing
  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight-decay": 0.01,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  # precision settings
  "attention_softmax_in_fp32": true,
  "fp16": {
    "fp16": true,
    "enabled": true,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },

  # misc. training settings
  "train-iters": 320000,
  "lr-decay-iters": 320000,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "save-interval": 500,
  "eval-interval": 1000,
  "eval-tasks-interval": 50000000000,
  "eval-iters": 10,

  # logging
  "log-interval": 100,
  "steps_per_print": 10,
  "keep-last-n-checkpoints": 4,
  "wall_clock_breakdown": true,

  # wandb
  "use_wandb": true,
  "wandb_project": "gpt-neox-ko-1b",
  "eval_tasks": ["nsmc"],

  # deepspeed launcher
  "launcher": "openmpi",
  "deepspeed_mpi": true
}
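
In the fp16 block, "loss_scale": 0 selects DeepSpeed's dynamic loss scaling (the dynamic_loss_scale column is true accordingly), with the schedule shaped by initial_scale_power, loss_scale_window, hysteresis and min_loss_scale. Below is a minimal sketch of how those four knobs are commonly interpreted, assuming the usual halve-on-overflow / double-after-a-clean-window scheme; it is an illustration of the parameter semantics, not DeepSpeed's implementation.

# Illustrative sketch only (not DeepSpeed code): the scale starts at
# 2 ** initial_scale_power, roughly tolerates `hysteresis` overflowing steps
# before halving, never drops below min_loss_scale, and doubles again after
# loss_scale_window consecutive overflow-free steps.
class LossScaleSketch:
    def __init__(self, initial_scale_power=32, loss_scale_window=1000,
                 hysteresis=2, min_loss_scale=1.0):
        self.scale = 2.0 ** initial_scale_power   # 2**32 for these runs
        self.window = loss_scale_window
        self.hysteresis_left = hysteresis
        self.min_scale = float(min_loss_scale)
        self.clean_steps = 0

    def step(self, grads_overflowed: bool) -> float:
        if grads_overflowed:
            self.hysteresis_left -= 1
            if self.hysteresis_left <= 0:
                self.scale = max(self.scale / 2.0, self.min_scale)
            self.clean_steps = 0
        else:
            self.clean_steps += 1
            if self.clean_steps % self.window == 0:
                self.scale *= 2.0
        return self.scale

scaler = LossScaleSketch()
print(scaler.step(True), scaler.step(True), scaler.step(False))  # 2**32, 2**31, 2**31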
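
Since these runs are logged to W&B (use_wandb: true, wandb_team: eleutherai-oslo, wandb_project: gpt-neox-ko-1b), the same columns can also be fetched programmatically. A minimal sketch using the public wandb API, assuming the runs live under the eleutherai-oslo/gpt-neox-ko-1b project path implied by the config (adjust the path and filter if they are stored elsewhere):

import wandb

# Pull the runs in this view through the W&B public API.
# NOTE: "eleutherai-oslo/gpt-neox-ko-1b" is an assumption taken from the
# wandb_team / wandb_project values in the logged config above.
api = wandb.Api()
runs = api.runs("eleutherai-oslo/gpt-neox-ko-1b", filters={"state": "crashed"})

for run in runs:
    cfg = run.config  # same keys as the table columns; fp16.* may appear as a nested dict
    print(run.name, run.state,
          cfg.get("batch_size"), cfg.get("data_path"),
          cfg.get("fp16", {}).get("initial_scale_power"))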