Kastan's workspace
Runs
545
Name
545 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
BATCH_SIZE
HIDDEN_SIZE
LEARNING_RATE
MODE
NUM_EPOCHS
NUM_MICRO_BATCHES
PIPELINE
SEQ_LEN
SEQ_LENGTH
TENSOR_PARALLEL
TENSOR_PARALLEL_MODE
TENSOR_PARALLEL_SIZE
TENSOR_SHAPE
TOTAL_BATCH_SIZE
VOCAB_SIZE
WARMUP_EPOCHS
WEIGHT_DECAY
backend
clip_grad_norm
colossal_config_file
config
data_dir
fp16.mode
from_torch
gpt2_8B
gpt2_small
gpt2_xl
gradient_accumulation
loss.type
loss_fn.type
model.checkpoint
model.dtype
model.fuse_scale_mask_softmax
model.max_position_embeddings
model.type
model.vocab_size
optimizer.lr
optimizer.type
optimizer.weight_decay
parallel.pipeline
parallel.tensor.mode
parallel.tensor.size
zero.model_config.reuse_fp16_shard
zero.model_config.shard_strategy
Crashed
kastan
3h 58m 35s
-
1280
-
0.00015
-
60
8
-
-
1024
-
2.5d
16
-
1280
50304
21
0.01
-
1
-
-
/u/kastanday/LLM-Distributed-Quantization/datasets/small-gpt-dataset.json
AMP_TYPE.NAIVE
-
titans.model.gpt.gpt.gpt2_8B
-
titans.model.gpt.gpt.gpt2_xl
1
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
torch.float16
true
1024
-
50304
0.00015
-
0.01
2
2.5d
16
-
-
Crashed
kastan
3h 58m 28s
-
1280
-
0.00015
-
60
8
-
-
1024
-
2.5d
16
-
1280
50304
21
0.01
-
1
-
-
/u/kastanday/LLM-Distributed-Quantization/datasets/small-gpt-dataset.json
AMP_TYPE.NAIVE
-
titans.model.gpt.gpt.gpt2_8B
-
titans.model.gpt.gpt.gpt2_xl
1
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
torch.float16
true
1024
-
50304
0.00015
-
0.01
2
2.5d
16
-
-
Crashed
kastan
34m 59s
-
1280
-
0.00015
-
60
8
-
-
1024
-
2.5d
16
-
1280
50304
21
0.01
-
1
-
-
/u/kastanday/LLM-Distributed-Quantization/datasets/small-gpt-dataset.json
AMP_TYPE.NAIVE
-
titans.model.gpt.gpt.gpt2_8B
-
titans.model.gpt.gpt.gpt2_xl
1
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
torch.float16
true
1024
-
50304
0.00015
-
0.01
2
2.5d
16
-
-
Crashed
kastan
2h 30m 55s
-
1280
-
0.00015
-
60
8
-
-
1024
-
2.5d
16
-
1280
50304
21
0.01
-
1
-
-
/u/kastanday/LLM-Distributed-Quantization/datasets/small-gpt-dataset.json
AMP_TYPE.NAIVE
-
titans.model.gpt.gpt.gpt2_8B
-
titans.model.gpt.gpt.gpt2_xl
1
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
torch.float16
true
1024
-
50304
0.00015
-
0.01
2
2.5d
16
-
-
Crashed
kastan
2h 47m 28s
-
1280
-
0.00015
-
60
8
-
-
1024
-
2.5d
16
-
1280
50304
21
0.01
-
1
-
-
/u/kastanday/LLM-Distributed-Quantization/datasets/small-gpt-dataset.json
AMP_TYPE.NAIVE
-
titans.model.gpt.gpt.gpt2_8B
-
titans.model.gpt.gpt.gpt2_xl
1
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
torch.float16
true
1024
-
50304
0.00015
-
0.01
2
2.5d
16
-
-
Crashed
kastan
3h 5m 38s
-
1280
-
0.00015
-
60
8
-
-
1024
-
2.5d
16
-
1280
50304
21
0.01
-
1
-
-
/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json
AMP_TYPE.NAIVE
-
titans.model.gpt.gpt.gpt2_8B
-
titans.model.gpt.gpt.gpt2_xl
1
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
torch.float16
true
1024
-
50304
0.00015
-
0.01
2
2.5d
16
-
-
Crashed
kastan
6m 9s
-
1280
-
0.00015
-
60
8
-
-
1024
-
2.5d
16
-
1280
50304
21
0.01
-
1
-
-
/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json
AMP_TYPE.NAIVE
-
titans.model.gpt.gpt.gpt2_8B
-
titans.model.gpt.gpt.gpt2_xl
1
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
torch.float16
true
1024
-
50304
0.00015
-
0.01
2
2.5d
16
-
-
Crashed
kastan
4m 37s
-
1280
-
0.00015
-
60
-
-
-
1024
-
2.5d
16
-
1280
50304
21
0.01
-
1
-
-
/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json
AMP_TYPE.NAIVE
-
titans.model.gpt.gpt.gpt2_8B
-
titans.model.gpt.gpt.gpt2_xl
1
-
-
true
torch.float16
true
1024
-
50304
-
-
-
1
2.5d
16
-
-
Crashed
kastan
3m 36s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
/u/kastanday/LLM-Distributed-Quantization/datasets/small-gpt-dataset.json
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
kastan
32m 5s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
nccl
-
./configs/gpt2_2d.py
./configs/gpt2_2d.py
/u/kastanday/LLM-Distributed-Quantization/datasets/small-gpt-dataset.json
-
true
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Killed
kastan
3m 49s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
nccl
-
./configs/gpt2_vanilla.py
./configs/gpt2_vanilla.py
/u/kastanday/LLM-Distributed-Quantization/datasets/small-gpt-dataset.json
-
true
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Failed
kastan
12s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
nccl
-
./configs/gpt2_vanilla.py
./configs/gpt2_vanilla.py
/u/kastanday/LLM-Distributed-Quantization/datasets/small-gpt-dataset.json
-
true
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
kastan
10m 47s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
nccl
-
./configs/gpt2_vanilla.py
./configs/gpt2_vanilla.py
/u/kastanday/LLM-Distributed-Quantization/datasets/small-gpt-dataset.json
-
true
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
kastan
10m 38s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
nccl
-
./configs/gpt2_vanilla.py
./configs/gpt2_vanilla.py
/u/kastanday/new_colossal_ai/ColossalAI/benchmark/gpt/kas_datasets/small-gpt-dataset.json
-
true
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
kastan
4d 10h 30m 25s
-
118.17568
768
-
-
60
7.87879
1
1024
-
7.90411
-
-
-
-
-
-
-
nccl
-
["/u/kastanday/new_colossal_ai/ColossalAI/examples/language/gpt/gpt2_configs/gpt2_1d_8B.py","/u/kastanday/new_colossal_ai/ColossalAI/examples/language/gpt/gpt2_configs/gpt2_3d_PP4_8B.py","/u/kastanday/new_colossal_ai/ColossalAI/examples/language/gpt/gpt2_configs/gpt2_3d_manual_8B.py","/u/kastanday/new_colossal_ai/ColossalAI/examples/language/gpt/gpt2_configs/gpt2_pp_manual.py"]
["/u/kastanday/new_colossal_ai/ColossalAI/examples/language/gpt/gpt2_configs/gpt2_1d_8B.py","/u/kastanday/new_colossal_ai/ColossalAI/examples/language/gpt/gpt2_configs/gpt2_3d_PP4_8B.py","/u/kastanday/new_colossal_ai/ColossalAI/examples/language/gpt/gpt2_configs/gpt2_3d_manual_8B.py","/u/kastanday/new_colossal_ai/ColossalAI/examples/language/gpt/gpt2_configs/gpt2_pp_manual.py"]
/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json
AMP_TYPE.NAIVE
false
titans.model.gpt.gpt.gpt2_8B
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
["titans.model.gpt.gpt.gpt2_8B","titans.model.gpt.gpt.gpt2_small"]
-
0.00015
torch.optim.adam.Adam
0.01
3.63014
["1d","3d"]
7.81081
-
-
Crashed
kastan
36m 38s
-
8
-
-
2d
60
4
2
1024
-
4
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_2d_TP4_PP2_8B.py
gpt2_configs/gpt2_2d_TP4_PP2_8B.py
/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json
AMP_TYPE.NAIVE
false
titans.model.gpt.gpt.gpt2_8B
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_8B
-
0.00015
torch.optim.adam.Adam
0.01
2
2d
4
-
-
Crashed
kastan
13m 26s
-
4
-
-
-
60
-
-
1024
-
8
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_3d_manual_8B.py
gpt2_configs/gpt2_3d_manual_8B.py
/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json
AMP_TYPE.NAIVE
false
titans.model.gpt.gpt.gpt2_8B
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_8B
-
0.00015
torch.optim.adam.Adam
0.01
1
3d
8
-
-
Finished
kastan
5m 3s
-
4
-
-
-
60
-
-
1024
-
8
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_3d_manual.py
gpt2_configs/gpt2_3d_manual.py
/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json
AMP_TYPE.NAIVE
false
-
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
torch.optim.adam.Adam
0.01
1
3d
8
-
-
Finished
kastan
1m 22s
-
1
-
-
-
60
-
-
1024
-
2
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_1d_manual.py
gpt2_configs/gpt2_1d_manual.py
/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json
AMP_TYPE.NAIVE
false
-
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
torch.optim.adam.Adam
0.01
1
1d
2
-
-
Crashed
kastan
11m 56s
-
8
768
-
-
60
4
1
1024
-
-
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_pp_manual.py
gpt2_configs/gpt2_pp_manual.py
/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json
AMP_TYPE.NAIVE
false
-
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
torch.optim.adam.Adam
0.01
1
-
1
-
-
Failed
kastan
5m 17s
-
8
768
-
-
60
4
5
1024
-
-
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_pp_XL.py
gpt2_configs/gpt2_pp_XL.py
/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json
AMP_TYPE.NAIVE
true
-
titans.model.gpt.gpt.gpt2_small
titans.model.gpt.gpt.gpt2_xl
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_xl
-
0.00015
torch.optim.adam.Adam
0.01
5
-
1
-
-
Failed
kastan
2d 1h 31m 59s
-
4
-
-
-
60
-
-
1024
-
8
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_3d.py
gpt2_configs/gpt2_3d.py
["/u/kastan/colossal/data/train_data_FINAL.json","/u/kastanday/colossal_ai/raw_json_backup/train_data_FINAL.json"]
AMP_TYPE.NAIVE
true
titans.model.gpt.gpt.gpt2_8B
titans.model.gpt.gpt.gpt2_small
titans.model.gpt.gpt.gpt2_xl
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
["titans.model.gpt.gpt.gpt2_8B","titans.model.gpt.gpt.gpt2_small","titans.model.gpt.gpt.gpt2_xl"]
-
0.00015
torch.optim.adam.Adam
0.01
1
3d
8
-
-
Finished
kastan
10m 40s
-
8
768
-
1d
60
4
2
1024
-
2
-
-
598
-
-
-
-
nccl
-
gpt2_configs/gpt2_pp1d.py
gpt2_configs/gpt2_pp1d.py
/u/kastan/colossal/data/train_data_FINAL.json
AMP_TYPE.NAIVE
true
-
titans.model.gpt.gpt.gpt2_small
-
-
-
titans.loss.vocab_cross_entropy.vocab_cross_entropy.vocab_parallel_cross_entropy
true
torch.float16
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
torch.optim.adam.Adam
0.01
2
1d
2
-
-
Crashed
kastan
8m 28s
-
8
768
-
-
60
4
2
1024
-
-
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_pp.py
gpt2_configs/gpt2_pp.py
/u/kastan/colossal/data/train_data_FINAL.json
AMP_TYPE.NAIVE
true
-
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
torch.optim.adam.Adam
0.01
2
-
1
-
-
Failed
kastan
1d 57m 45s
-
2
-
-
-
60
-
-
1024
-
-
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_zero3.py
gpt2_configs/gpt2_zero3.py
/u/kastan/colossal/data/train_data_FINAL.json
-
true
-
titans.model.gpt.gpt.gpt2_small
-
-
-
-
true
-
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
colossalai.nn.optimizer.hybrid_adam.HybridAdam
0.01
-
-
-
true
["<colossalai.zero.shard_utils.tensor_shard_strategy.TensorShardStrategy object at 0x1457cac434c0>","<colossalai.zero.shard_utils.tensor_shard_strategy.TensorShardStrategy object at 0x1462da3f3310>","<colossalai.zero.shard_utils.tensor_shard_strategy.TensorShardStrategy object at 0x148dc484e190>","<colossalai.zero.shard_utils.tensor_shard_strategy.TensorShardStrategy object at 0x14c5a94c72b0>","<colossalai.zero.shard_utils.tensor_shard_strategy.TensorShardStrategy object at 0x14dc247654c0>","<colossalai.zero.shard_utils.tensor_shard_strategy.TensorShardStrategy object at 0x150e7506b340>","<colossalai.zero.shard_utils.tensor_shard_strategy.TensorShardStrategy object at 0x152b4cc525e0>","<colossalai.zero.shard_utils.tensor_shard_strategy.TensorShardStrategy object at 0x1531bb88c430>","<colossalai.zero.shard_utils.tensor_shard_strategy.TensorShardStrategy object at 0x1537ebac04f0>","<colossalai.zero.shard_utils.tensor_shard_strategy.TensorShardStrategy object at 0x1549b257e5e0>"]
Crashed
kastan
17m 14s
-
8
768
-
-
60
4
2
1024
-
-
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_pp_2gpu.py
gpt2_configs/gpt2_pp_2gpu.py
/u/kastan/colossal/data/train_data_FINAL.json
AMP_TYPE.NAIVE
true
-
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
torch.optim.adam.Adam
0.01
2
-
1
-
-
Finished
kastan
7m 28s
-
1
-
-
-
60
-
-
1024
-
-
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_vanilla.py
gpt2_configs/gpt2_vanilla.py
/u/kastan/colossal/data/train_data_FINAL.json
AMP_TYPE.NAIVE
true
-
titans.model.gpt.gpt.gpt2_small
-
-
-
-
true
-
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
torch.optim.adam.Adam
0.01
1
-
1
-
-
Finished
kastan
8m 58s
-
4
-
-
-
60
-
-
1024
-
4
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_2d.py
gpt2_configs/gpt2_2d.py
/u/kastan/colossal/data/train_data_FINAL.json
AMP_TYPE.NAIVE
true
-
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
torch.optim.adam.Adam
0.01
1
2d
4
-
-
Finished
kastan
4m 34s
-
1
-
-
-
60
-
-
1024
-
2
-
-
-
-
-
-
-
nccl
-
gpt2_configs/gpt2_1d.py
gpt2_configs/gpt2_1d.py
/u/kastan/colossal/data/train_data_FINAL.json
AMP_TYPE.NAIVE
true
-
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
torch.optim.adam.Adam
0.01
1
1d
2
-
-
Failed
kastan
15h 17m 30s
-
1.59574
768
-
-
60
4
1
1024
-
2
-
-
-
-
-
-
-
nccl
-
["gpt2_configs/gpt2_1d.py","gpt2_configs/gpt2_pp.py"]
["gpt2_configs/gpt2_1d.py","gpt2_configs/gpt2_pp.py"]
/u/kastan/colossal/data/train_data_FINAL.json
AMP_TYPE.NAIVE
true
-
titans.model.gpt.gpt.gpt2_small
-
-
titans.loss.lm_loss.gpt_lmloss.GPTLMLoss
-
true
-
-
-
titans.model.gpt.gpt.gpt2_small
-
0.00015
torch.optim.adam.Adam
0.01
1
1d
1.91489
-
-
1-30
of 30