Kastan's workspace
Runs
614
Name
614 visualized
State
Notes
Tags
Created
Runtime
GPU Count
model.layernorm_dtype
model.decoder_dtype
model.embed_dtype
model.head_dtype
LOG_PATH
clip_grad_norm
fp16.mode
gradient_accumulation
total_gpus
MICRO_BATCH_SIZE
NUM_MICRO_BATCHES
PIPELINE_SIZE
TENSOR_PARALLEL_MODE
TENSOR_PARALLEL_SIZE
loss.type
model.dtype
model.fuse_scale_mask_softmax
parallel.tensor.depth
parallel.tensor.mode
parallel.tensor.size
schedule.num_microbatches
schedule.scatter_gather_tensors
schedule.tensor_shape
schedule.type
loss
per_step_metrics.Tflops
per_step_metrics.samples_per_sec
decoder_layer_norm_mean
embed_layer_norm
head_layer_norm
layer_norm
decoder_norm_layer0
decoder_norm_layer1
decoder_norm_layer10
decoder_norm_layer11
decoder_norm_layer2
Finished
10m 15s
-
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
./gpt2_bs4_lr0.00015/
-
-
-
2
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
40.5
24.82
215.12
-
0
-
-
-
-
-
-
-
Finished
7m 1s
-
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
./gpt2_bs4_lr0.00015/
-
-
-
1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
40.5
23.489
101.79
-
1.52429
1.52368
-
-
-
-
-
-
Crashed
0s
-
torch.float32
torch.float32
torch.float32
torch.bfloat16
./quant_gpt2_2.5d_tp4_bs32_lr0.00015/
1
AMP_TYPE.NAIVE
4
-
-
4
-
2.5d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
-
1
2.5d
4
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
28m 18s
-
-
-
-
-
./gpt2_2.5d_tp16_bs1280_lr0.00015_accum1_clip_grad1.0/
1
AMP_TYPE.NAIVE
1
32
-
8
-
2.5d
16
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
torch.float16
true
1
2.5d
16
-
-
-
-
240.59171
178.13
14.31675
-
0
-
-
-
-
-
-
-
Crashed
28m 2s
-
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
./gpt2_2.5d_tp16_bs1280_lr0.00015_accum1_clip_grad1.0/
1
-
1
32
-
8
-
2.5d
16
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
1
2.5d
16
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
29m 4s
-
-
-
-
-
./gpt2_2.5d_tp16_bs1280_lr0.00015_accum1_clip_grad1.0/
1
AMP_TYPE.NAIVE
1
32
-
8
-
2.5d
16
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
torch.float16
true
1
2.5d
16
-
-
-
-
246.48434
160.8675
12.929
-
0
-
-
-
-
-
-
-
Crashed
28m 37s
-
-
-
-
-
-
1
AMP_TYPE.NAIVE
1
16
4
8
2
2d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
torch.float16
-
-
2d
4
8
true
1366.66667
colossalai.engine.schedule._pipeline_schedule.PipelineSchedule
131.58856
3.87243
4.08957
-
0
-
-
-
-
-
-
-
Crashed
2m 15s
-
-
-
-
-
-
1
AMP_TYPE.NAIVE
1
8
4
8
2
2d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
torch.float16
-
-
2d
4
8
true
1366.66667
colossalai.engine.schedule._pipeline_schedule.PipelineSchedule
131.37064
2.5304
1.3362
-
0
-
-
-
-
-
-
-
Crashed
29m 26s
-
-
-
-
-
-
1
AMP_TYPE.NAIVE
1
8
-
-
-
3d
8
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
torch.float16
-
-
3d
8
-
-
-
-
10.34052
3.55814
1.33033
-
0
-
-
-
-
-
-
-
Crashed
29m 52s
-
-
-
-
-
-
1
AMP_TYPE.NAIVE
1
4
-
-
-
1d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
torch.float16
-
-
1d
4
-
-
-
-
28.89033
8.2135
3.10255
-
0
-
-
-
-
-
-
-
Crashed
6m 58s
-
torch.float16
torch.float16
torch.float16
torch.float16
["./gpt2_2.5d_tp4_bs64_lr0.00015_accum1_clip_grad1.0/","./quant_gpt2_3d_tp8_bs64_lr0.00015/"]
1
AMP_TYPE.NAIVE
1
32
4
16
2
["2.5d","3d"]
6
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
torch.float16
-
1
["2.5d","3d"]
6
16
true
1366.66667
colossalai.engine.schedule._pipeline_schedule.PipelineSchedule
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
3m 5s
-
-
-
-
-
./gpt2_2.5d_tp4_bs32_lr0.00015_accum2_clip_grad1.0/
1
AMP_TYPE.NAIVE
2
32
4
8
2
2.5d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
torch.float16
-
1
2.5d
4
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
3m 13s
-
-
-
-
-
./gpt2_2.5d_tp4_bs32_lr0.00015_accum2_clip_grad1.0/
1
AMP_TYPE.NAIVE
2
64
4
8
2
2.5d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
torch.float16
-
1
2.5d
4
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
11m 24s
-
-
-
-
-
./gpt2_2.5d_tp4_bs32_lr0.00015_accum2_clip_grad1.0/
1
AMP_TYPE.NAIVE
2
64
4
8
2
2.5d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
torch.float16
-
1
2.5d
4
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
41s
-
torch.float16
torch.float16
torch.float16
torch.float16
./quant_gpt2_3d_tp8_bs16_lr0.00015/
1
AMP_TYPE.NAIVE
4
64
4
16
2
3d
8
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
-
-
3d
8
16
true
1366.66667
colossalai.engine.schedule._pipeline_schedule.PipelineSchedule
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
37s
-
torch.float32
torch.float32
torch.float32
torch.bfloat16
./quant_gpt2_2.5d_tp4_bs32_lr0.00015/
1
AMP_TYPE.NAIVE
4
32
-
4.5
-
2.5d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
-
1
2.5d
4
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
4m 15s
-
torch.float32
torch.float32
torch.float32
torch.bfloat16
./quant_gpt2_2.5d_tp4_bs32_lr0.00015/
1
AMP_TYPE.NAIVE
4
-
-
8
-
2.5d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
-
1
2.5d
4
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Failed
50s
-
torch.float32
torch.float32
torch.float32
torch.bfloat16
./quant_gpt2_2.5d_tp4_bs8_lr0.00015/
1
AMP_TYPE.NAIVE
4
-
-
8
-
2.5d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
-
1
2.5d
4
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Failed
56s
-
torch.float32
torch.float32
torch.float32
torch.bfloat16
./quant_gpt2_2.5d_tp4_bs8_lr0.00015/
1
AMP_TYPE.NAIVE
4
-
-
8
-
2.5d
4
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
-
1
2.5d
4
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
7m 16s
-
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
./gpt2_bs4_lr0.00015/
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
40.5
24.654
213.68
-
0
-
-
-
-
-
-
-
1-20
of 43