Schoelkopf's group workspace
Group: mamba-tp1-bs16-fusedinner_r7kwr8b2_11cu2i6r
Runs 1-4 of 4 (all Crashed within about 13.5 minutes):

  State     Notes   User         Runtime   Sweep
  Crashed   -       schoelkopf   13m 30s   -
  Crashed   -       schoelkopf   13m 30s   -
  Crashed   -       schoelkopf   13m 30s   -
  Crashed   -       schoelkopf   13m 27s   -

(No values for the Tags or Created columns survived this export. All four runs
logged identical configs, so the shared config is shown once below.)
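This page is just a rendered table of logged run data, so the same records can
be fetched programmatically. A minimal sketch using the public wandb client,
assuming the entity/project names from the config below and that the suffixed
group string in the page header is the group name W&B actually stored:

  import wandb

  # Sketch: list this group's runs via the W&B public API. The group string is
  # taken from the page header; the run config's own "wandb_group" is the
  # unsuffixed "mamba-tp1-bs16-fusedinner".
  api = wandb.Api()
  runs = api.runs(
      "eleutherai/mamba-neox-tp-memsavings",
      filters={"group": "mamba-tp1-bs16-fusedinner_r7kwr8b2_11cu2i6r"},
  )
  for run in runs:
      # run.config holds the flattened columns shown below; run.summary holds
      # logged metrics such as the built-in "_runtime" (seconds).
      print(run.name, run.state, run.summary.get("_runtime"))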
Shared run config (flattened W&B columns):

  activation: silu
  adlr_autoresume: false
  adlr_autoresume_interval: 1000
  apply_query_key_layer_scaling: false
  attention_config: ["mamba", "mamba", ..., "mamba"]  (24 entries, all "mamba")
  attention_dropout: 0
  attention_softmax_in_fp32: false
  batch_size: 16
  bias_dropout_fusion: false
  bias_gelu_fusion: false
  char_level_ppl: false
  checkpoint_activations: true
  checkpoint_factor: 250
  checkpoint_in_cpu: false
  checkpoint_num_layers: 1
  checkpoint_scale: linear
  checkpoint_validation_with_forward_pass: false
  clip_grad: 1
  config_files.mamba-160m.yml:
    {
      "pipe_parallel_size": 0,
      "model_parallel_size": 1,
      "num_layers": 24,
      "hidden_size": 768,
      "num_attention_heads": 12,
      "seq_length": 2048,
      "max_position_embeddings": 2048,
      "pos_emb": "rotary",
      "rotary_pct": 0.25,
      "no_weight_tying": true,
      "gpt_j_residual": true,
      "output_layer_parallelism": "column",
      "attention_config": [[["mamba"], 24]],
      # "scaled_upper_triang_masked_softmax_fusion": true,
      # "bias_gelu_fusion": true,
      "mamba_selective_scan_fusion": true,
      "mamba_causal_conv_fusion": true,
      "mamba_inner_func_fusion": true,
      "mamba_selective_fp32_params": true,
      "activation": "silu",
      "norm": "rmsnorm",
      "rms_norm_epsilon": 1.0e-5,
      "output_layer_init_method": "single_residual_scaled_normal",
      # "init_method": "small_init",
      # "output_layer_init_method": "wang_init",
      "optimizer": {
        "type": "Adam",
        "params": {
          "lr": 0.0006,
          "betas": [0.9, 0.95],
          "eps": 1.0e-8
        }
      },
      "min_lr": 0.00006,
      "zero_optimization": {
        "stage": 1,
        "allgather_partitions": true,
        "allgather_bucket_size": 500000000,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 500000000,
        "contiguous_gradients": true,
        "cpu_offload": false
      },
      "train_micro_batch_size_per_gpu": 16,
      "gradient_accumulation_steps": 2,
      "data_impl": "mmap",
      "num_workers": 1,
      "checkpoint_activations": true,
      "checkpoint_num_layers": 1,
      "partition_activations": true,
      "synchronize_each_layer": true,
      "gradient_clipping": 1.0,
      "weight_decay": 0.1,
      "hidden_dropout": 0,
      "attention_dropout": 0,
      "fp16": {
        "fp16": true,
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 12,
        "hysteresis": 2,
        "min_loss_scale": 1
      },
      "train_iters": 143001,
      "lr_decay_iters": 143000,
      "distributed_backend": "nccl",
      "lr_decay_style": "cosine",
      "warmup": 0.01,
      "checkpoint_factor": 250,
      # "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
      "eval_interval": 143000,
      "eval_iters": 10,
      "log_interval": 10,
      "steps_per_print": 10,
      "wall_clock_breakdown": true,
      "tokenizer_type": "HFTokenizer",
      "vocab_file": "/weka/pile/20B_tokenizer.json",
      # "save": "/weka/hailey/mamba-ckpts/mamba-160m-pythia-test-conv-bias",
      # "load": "/weka/hailey/mamba-ckpts/mamba-160m-pythia-test-conv-bias",
      # "s3_path": "s3://s-eai-neox-west/hailey/mamba/test-ckpts/mamba-160m-pythia-test-conv-bias",
      # "keep_last_n_checkpoints": 2,
      "train_data_paths": ["/weka/pile/pile_20B_tokenizer_text_document"],
      "valid_data_paths": ["/weka/pile/pile_20B_tokenizer_text_document"],
      "test_data_paths": ["/weka/pile/pile_20B_tokenizer_text_document"],
      "launcher": "slurm",
      "deepspeed_slurm": true,
      # "account": "eleuther",
      "no_ssh_check": true,
      "use_wandb": true,
      "wandb_group": "mamba-tp1-bs16-fusedinner",
      "wandb_team": "eleutherai",
      "wandb_project": "mamba-neox-tp-memsavings",
    }
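A quick sanity check on the batch arithmetic in this YAML: each optimizer step
consumes train_micro_batch_size_per_gpu × gradient_accumulation_steps ×
seq_length = 16 × 2 × 2048 = 65,536 tokens per data-parallel rank. The
data-parallel world size is not recorded in this export, so the sketch below
leaves it as a parameter (the 32-rank example is purely hypothetical):

  # Values from the YAML above; world_size is an assumption, not in the export.
  micro_bs = 16          # train_micro_batch_size_per_gpu
  grad_accum = 2         # gradient_accumulation_steps
  seq_len = 2048         # seq_length
  train_iters = 143_001  # train_iters

  def tokens_per_step(world_size: int) -> int:
      """Tokens consumed by one optimizer step across all data-parallel ranks."""
      return micro_bs * grad_accum * seq_len * world_size

  # A hypothetical 32-way data-parallel run would give a global batch of
  # 16 * 2 * 32 = 1024 sequences and cover roughly 300B tokens in total:
  print(tokens_per_step(32))                # 2,097,152 tokens per step
  print(tokens_per_step(32) * train_iters)  # ~3.0e11 tokens over the run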
Remaining flattened columns:

  contiguous_checkpointing: false
  coord_check: false
  create_moe_param_group: true
  curriculum_seqlen: 0
  data_impl: mmap
  deepscale: false
  deepspeed: true
  deepspeed_activation_checkpointing: true
  deepspeed_mpi: false
  deepspeed_slurm: true
  detect_nvlink_pairs: false
  distributed_backend: nccl
  dump_state: false
  dynamic_loss_scale: true
  eod_mask_loss: false
  eval_interval: 143000
  eval_iters: 10
  eval_results_prefix: (empty; no value survived the export)
  expert_interval: 2
  finetune: false
  force_multi: false
  fp16.enabled: true
  fp16.fp16: true
  fp16.hysteresis: 2
  fp16.initial_scale_power: 12
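The fp16.* columns mirror the fp16 block in the YAML: loss_scale: 0 selects
dynamic loss scaling (hence dynamic_loss_scale: true), starting at
2^initial_scale_power = 2^12 = 4096, backing off on overflow, and growing again
after loss_scale_window = 1000 clean steps, with a floor of min_loss_scale = 1.
A generic sketch of that policy; DeepSpeed's exact hysteresis semantics are an
assumption here:

  class DynamicLossScaler:
      """Dynamic fp16 loss scaling with the parameters configured above."""

      def __init__(self, initial_scale_power=12, window=1000, hysteresis=2,
                   min_scale=1.0):
          self.scale = 2.0 ** initial_scale_power  # 4096
          self.window = window          # clean steps before growing the scale
          self.hysteresis = hysteresis  # overflows tolerated before backing off
          self.min_scale = min_scale
          self._good_steps = 0
          self._overflows_left = hysteresis

      def update(self, overflow: bool) -> None:
          if overflow:
              self._good_steps = 0
              self._overflows_left -= 1
              if self._overflows_left <= 0:
                  # Halve the scale after `hysteresis` overflows, never going
                  # below the configured floor.
                  self.scale = max(self.scale / 2.0, self.min_scale)
                  self._overflows_left = self.hysteresis
          else:
              self._good_steps += 1
              if self._good_steps >= self.window:
                  # Double the scale after `window` consecutive clean steps.
                  self.scale *= 2.0
                  self._good_steps = 0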