Taishi-nakamura's workspace
Runs
167
Name
12 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
accumulate_allreduce_grads_in_fp32
activation_function
adam_beta1
adam_beta2
adam_eps
add_bias_linear
add_position_embedding
add_qkv_bias
adlr_autoresume
adlr_autoresume_interval
apply_layernorm_1p
apply_query_key_layer_scaling
apply_residual_connection_post_layernorm
apply_rope_fusion
async_tensor_model_parallel_allreduce
attention_dropout
attention_softmax_in_fp32
auto_detect_ckpt_format
barrier_with_L1_time
base_model
bert_binary_head
bert_embedder_type
bf16
bias_dropout_fusion
bias_gelu_fusion
bias_swiglu_fusion
biencoder_projection_dim
biencoder_shared_query_context_model
check_for_nan_in_loss_and_grad
checkpoint_type
ckpt_fully_parallel_save
classes_fraction
clip_grad
clone_scatter_output_in_embedding
consumed_train_samples
consumed_train_tokens
consumed_valid_samples
context_parallel_size
continual_pretraining
create_attention_mask_in_dataloader
Finished
-
taishi-nakamura
8x1.5B
DUS-0.5
2d 16h 22m 50s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/home/shared/experiments/0141_drop_upcycling/checkpoints_init/Mixtral_dp_0.5_8x1.5B
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
DUS-0.5
4d 3h 37s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/home/shared/experiments/0141_drop_upcycling/checkpoints_init/Mixtral_dp_0.5_8x1.5B
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Finished
-
taishi-nakamura
8x152M
DUS-0.5
3d 25m 35s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/home/shared/experiments/0141_drop_upcycling/checkpoints_init/Mixtral_dp_0.5_8x152M
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Finished
-
taishi-nakamura
8x1.5B
RNU-0.5
5d 5m 30s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/home/taishi/workspace/drop-upcycle/checkpoints/upcycle-Mixtral-8x1.56B-shuffle_torch_rand_002_random_rand_init_0.50_noise
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
ablation
layer-wise-lb
1d 6h 45m 59s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/home/taishi/workspace/drop-upcycle/checkpoints/upcycle-Mixtral-8x1.56B-shuffle_torch_rand_002_random_rand_init_0.50_noise
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
RNU-0.5
4d 13h 36m 39s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/home/taishi/workspace/drop-upcycle/checkpoints/upcycle-Mixtral-8x1.56B-shuffle_torch_rand_002_random_rand_init_0.50_noise
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
RNU-0.5
1d 4h 17m 36s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/home/taishi/workspace/drop-upcycle/checkpoints/upcycle-Mixtral-8x1.56B-shuffle_torch_rand_002_random_rand_init_0.50_noise
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Finished
-
taishi-nakamura
8x152M
RNS-0.5
1d 4h 8m 40s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/home/taishi/workspace/drop-upcycle/checkpoints/upcycle-Mixtral-8x152M-shuffle_torch_rand_002_iter_0477000_random_rand_init_0.50_noise
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
ablation
layer-wise-lb
14h 41m 18s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/upcycle-Mixtral-8x1.56B-btx_torch_rand_002
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
ablation
layer-wise-lb
1d 7h 53m 37s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/from_scratch_Mixtral-8x1.56B/
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
ablation
layer-wise-lb
1d 8h 4m 25s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/upcycle-Mixtral-8x1.56B-shuffle_torch_rand_002_random_init_1.0/
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
ablation
layer-wise-lb
1d 8h 11m 56s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/upcycle-Mixtral-8x1.56B-torch_rand_002
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
ablation
layer-wise-lb
1d 8h 14m 22s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/upcycle-Mixtral-8x1.56B-shuffle_torch_rand_002_random_init_0.5/
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
ablation
layer-wise-lb
17h 41m 37s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/upcycle-Mixtral-8x1.56B-btx_torch_rand_002
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x152M
RNS-0.5
1d 21h 43m 47s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/upcycle-Mixtral-8x152M-shuffle_torch_rand_002_iter_0477000_random_rand_init_0.50_noise/
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Finished
-
taishi-nakamura
8x152M
BTX
3d 9m 5s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/upcycle-Mixtral-8x152M-btx_torch_rand_002
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x152M
BTX
6m 17s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/upcycle-Mixtral-8x152M-btx_torch_rand_002
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Crashed
-
taishi-nakamura
8x1.5B
DUI-0.75
ablation
12h 13m 18s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/upcycle-Mixtral-8x1.56B-shuffle_torch_rand_002_random_init_0.75/
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Finished
-
taishi-nakamura
8x3.7B
FS
1d 20h 57m 22s
-
-
silu
0.9
0.95
1.0000e-8
-
-
-
-
-
-
-
-
-
-
0.1
-
-
-
/gs/bs/tgh-NII-LLM/checkpoints/from_scratch_Mixtral-8x3.78B
-
-
true
-
-
-
-
-
-
LOCAL_STATE_DICT
-
-
-
-
-
-
-
-
true
-
Finished
-
taishi-nakamura
152M
BTX
expert
5h 41m 33s
-
true
-
0.9
0.95
1.0000e-8
false
false
false
false
1000
false
false
false
true
true
0
true
false
true
-
true
megatron
true
true
false
true
0
false
true
-
false
1
1
true
0
0
0
1
-
true
1-20
of 167