Shunk031's workspace
Runs
24
State
Notes
User
Tags
Created
Runtime
Sweep
abci_user
algorithms.gradient_clipping.clipping_threshold
algorithms.gradient_clipping.clipping_type
auto_resume
autoresume
callbacks.mono_ckpt_saver.batch_interval
callbacks.mono_ckpt_saver.save_folder
callbacks.speed_monitor.window_size
console_log_interval
device_eval_batch_size
device_train_batch_size
device_train_grad_accum
device_train_microbatch_size
dist_timeout
enabled_algorithms/GradientClipping
eval_first
eval_interval
fsdp_config.activation_checkpointing
fsdp_config.activation_checkpointing_reentrant
fsdp_config.activation_cpu_offload
fsdp_config.limit_all_gathers
fsdp_config.mixed_precision
fsdp_config.sharded_ckpt_prefix_dir
fsdp_config.sharding_strategy
fsdp_config.state_dict_type
fsdp_config.verbose
global_seed
global_train_batch_size
log_to_console
loggers.wandb.entity
loggers.wandb.name
loggers.wandb.project
max_duration
max_seq_len
model.config_overrides.attn_config.attn_impl
model.config_overrides.attn_config.attn_uses_sequence_id
model.config_overrides.max_seq_len
model.init_device
model.name
model.pretrained
model.pretrained_model_name_or_path
n_gpus
n_params
node_name
Crashed
-
shunk031
25m 1s
-
acf15658su
1
norm
-
true
1
/scratch/acf15658su/exp03/mono_checkpoints
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
ba{batch}-shards
FULL_SHARD
sharded
true
19950815
1120
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
140
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
51m 1s
-
acf15658su
1
norm
-
true
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
ba{batch}-shards
FULL_SHARD
sharded
true
19950815
1120
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
140
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
46s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
unknown because NODENAME environment variable not set
Crashed
-
shunk031
20m 1s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
ba{batch}-shards
FULL_SHARD
sharded
true
19950815
1120
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
140
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
8h 50m 34s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
ba{batch}-shards
FULL_SHARD
sharded
true
19950815
1120
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
140
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
1h 14m 2s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1120
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
140
29957200896
unknown because NODENAME environment variable not set
Failed
-
shunk031
54s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1120
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
140
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
1h 21m 32s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1120
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
140
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
1h 1m
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1120
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
140
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
1h 2m 1s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1088
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
136
29957200896
unknown because NODENAME environment variable not set
Failed
-
shunk031
56s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1088
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
136
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
24m 31s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1088
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
136
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
52s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1088
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
136
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
50m 58s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1088
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
136
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
51m 32s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1088
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
136
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
46m 31s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1088
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
136
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
22m 31s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1024
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
128
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
32m 1s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
1024
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
128
29957200896
unknown because NODENAME environment variable not set
Crashed
-
shunk031
40m 2s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
960
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
120
29957200896
unknown because NODENAME environment variable not set
Failed
-
shunk031
20m 57s
-
acf15658su
1
norm
true
-
-
-
10
1ba
1
8
8
1
600
true
false
1
true
false
true
true
PURE
-
FULL_SHARD
-
true
19950815
896
true
shunk031
mpt-30b
abci-llm-distributed-training-hackathon-01
375000ba
2048
torch
false
2048
mixed
hf_causal_lm
true
mosaicml/mpt-30b
112
29957200896
unknown because NODENAME environment variable not set
1-20
of 24