eleutherai-oslo's workspace
Runs: 480 in the workspace; the current view contains 3 runs, tabulated below.
Run 1: state Crashed, user eleutherai-oslo, runtime 12d 20h 4m 2s
Run 2: state Crashed, user eleutherai-oslo, runtime 10d 14h 2m 38s
Run 3: state Crashed, user eleutherai-oslo, runtime 4d 19h 46m 23s

Logged columns (a single value means all three runs share it; where the runs differ, values are given as Run 1 | Run 2 | Run 3; the Notes, Tags and Created columns carried no values in the export):

Sweep: -
activation: gelu
adlr_autoresume: false
adlr_autoresume_interval: 1000
apply_query_key_layer_scaling: false
attention_config: global
attention_dropout: 0
attention_softmax_in_fp32: true
batch_size: 4 | 8 | 4
bias_dropout_fusion: false
bias_gelu_fusion: false | true | true
char_level_ppl: false
checkpoint_activations: true
checkpoint_in_cpu: false
checkpoint_num_layers: 1
checkpoint_validation_with_forward_pass: false
clip_grad: 1
config_files.1B_ko.yml: - | - | three logged versions of 1B_ko.yml (reproduced below)
contiguous_checkpointing: false
data_impl: mmap
data_path:
  Run 1: /fsx/multi-lingual-6b/gpt-neox/processed/multi_ko_6b_text_document
  Run 2: /fsx/multi-lingual-6b/gpt-neox/processed/multi_ko_3b_text_document
  Run 3: /fsx/multi-lingual-6b/gpt-neox/processed/multi_ko_1b_text_document
deepscale: false
deepspeed: true
deepspeed_activation_checkpointing: true
deepspeed_mpi: true
detect_nvlink_pairs: false
distributed_backend: nccl
dump_state: false
dynamic_loss_scale: true
eod_mask_loss: false
eval_interval: 1000
eval_iters: 10
eval_results_prefix: (no value in the export)
eval_tasks: nsmc
eval_tasks_interval: 50000000000
finetune: false
fp16.enabled: true
fp16.fp16: true
fp16.hysteresis: 2
fp16.initial_scale_power: 32
fp16.loss_scale: - | - | 0
fp16.loss_scale_window: 1000
fp16.min_loss_scale: 1
fp16_lm_cross_entropy: false
fp32_allreduce: false

config_files.1B_ko.yml (Run 3): the cell holds three versions of the config file. The first version is reproduced in full below. As far as the logged text shows, the second version differs only in "save-interval" (5000 instead of 500), and the third keeps "save-interval": 5000 and additionally has the "load" line commented out, pointing at /fsx/multi-lingual-6b/gpt-neox/checkpoints/6B_8.

# Same as GPT-Neo 1.3B
{
  # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in
  "tokenizer-type": "HFTokenizer",
  "vocab-file": "./tokenizer/MBBPE/tokenizer.json",
  "save": "/fsx/multi-lingual-6b/gpt-neox/checkpoints/1B_scratch",
  "load": "/fsx/multi-lingual-6b/gpt-neox/checkpoints/1B_scratch",

  # wandb config
  "wandb_team": "eleutherai-oslo",

  # If finetuning, edit the following to the location of your finetuning dataset:
  "data-path": "/fsx/multi-lingual-6b/gpt-neox/processed/multi_ko_1b_text_document",
  # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
  # across the node boundaries )
  "is_pipe_parallel": false,
  "pipe-parallel-size": 1,
  "model-parallel-size": 1,

  # model settings
  "num-layers": 24,
  "hidden-size": 2048,
  "num-attention-heads": 16,
  "seq-length": 2048,
  "max-position-embeddings": 2048,
  "norm": "layernorm",
  "pos-emb": "rotary",
  "no-weight-tying": true,
  "rotary_ndims": 64,
  "gpt_j_residual": true,
  "output_layer_parallelism": "column",

  # these should provide some speedup but takes a while to build, set to true if desired
  "scaled-upper-triang-masked-softmax-fusion": false,
  "bias-gelu-fusion": true,

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 2.0e-4,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8,
    }
  },
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": True,
    "allgather_bucket_size": 500000000,
    "overlap_comm": True,
    "reduce_scatter": True,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": True,
    "cpu_offload": False
  },

  # batch / data settings
  "train_micro_batch_size_per_gpu": 4,
  "gradient_accumulation_steps": 1,
  "data-impl": "mmap",
  "split": "949,50,1",

  # activation checkpointing
  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight-decay": 0.01,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  # precision settings
  "attention_softmax_in_fp32": true,
  "fp16": {
    "fp16": true,
    "enabled": true,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },

  # misc. training settings
  "train-iters": 320000,
  "lr-decay-iters": 320000,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "save-interval": 500,
  "eval-interval": 1000,
  "eval-tasks-interval": 50000000000,
  "eval-iters": 10,

  # logging
  "log-interval": 100,
  "steps_per_print": 10,
  "keep-last-n-checkpoints": 4,
  "wall_clock_breakdown": true,

  # wandb
  "use_wandb": true,
  "wandb_project": "gpt-neox-ko-1b",
  "eval_tasks": ["nsmc"],

  # deepspeed launcher
  "launcher": "openmpi",
  "deepspeed_mpi": true
}
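
In the fp16 block, "loss_scale": 0 selects DeepSpeed's dynamic loss scaling (the dynamic_loss_scale column is true accordingly), with the schedule shaped by initial_scale_power, loss_scale_window, hysteresis and min_loss_scale. Below is a minimal sketch of how those four knobs are commonly interpreted, assuming the usual halve-on-overflow / double-after-a-clean-window scheme; it is an illustration of the parameter semantics, not DeepSpeed's implementation.

# Illustrative sketch only (not DeepSpeed code): the scale starts at
# 2 ** initial_scale_power, roughly tolerates `hysteresis` overflowing steps
# before halving, never drops below min_loss_scale, and doubles again after
# loss_scale_window consecutive overflow-free steps.
class LossScaleSketch:
    def __init__(self, initial_scale_power=32, loss_scale_window=1000,
                 hysteresis=2, min_loss_scale=1.0):
        self.scale = 2.0 ** initial_scale_power   # 2**32 for these runs
        self.window = loss_scale_window
        self.hysteresis_left = hysteresis
        self.min_scale = float(min_loss_scale)
        self.clean_steps = 0

    def step(self, grads_overflowed: bool) -> float:
        if grads_overflowed:
            self.hysteresis_left -= 1
            if self.hysteresis_left <= 0:
                self.scale = max(self.scale / 2.0, self.min_scale)
            self.clean_steps = 0
        else:
            self.clean_steps += 1
            if self.clean_steps % self.window == 0:
                self.scale *= 2.0
        return self.scale

scaler = LossScaleSketch()
print(scaler.step(True), scaler.step(True), scaler.step(False))  # 2**32, 2**31, 2**31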
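
Since these runs are logged to W&B (use_wandb: true, wandb_team: eleutherai-oslo, wandb_project: gpt-neox-ko-1b), the same columns can also be fetched programmatically. A minimal sketch using the public wandb API, assuming the runs live under the eleutherai-oslo/gpt-neox-ko-1b project path implied by the config (adjust the path and filter if they are stored elsewhere):

import wandb

# Pull the runs in this view through the W&B public API.
# NOTE: "eleutherai-oslo/gpt-neox-ko-1b" is an assumption taken from the
# wandb_team / wandb_project values in the logged config above.
api = wandb.Api()
runs = api.runs("eleutherai-oslo/gpt-neox-ko-1b", filters={"state": "crashed"})

for run in runs:
    cfg = run.config  # same keys as the table columns; fp16.* may appear as a nested dict
    print(run.name, run.state,
          cfg.get("batch_size"), cfg.get("data_path"),
          cfg.get("fp16", {}).get("initial_scale_power"))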