Borzunov's workspace
Runs
185
Name
185 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
bmuf.average_sync
bmuf.block_lr
bmuf.block_momentum
bmuf.distributed_world_size
bmuf.global_sync_iter
bmuf.use_nbm
bmuf.warmup_iterations
checkpoint.best_checkpoint_metric
checkpoint.checkpoint_shard_count
checkpoint.checkpoint_suffix
checkpoint.keep_best_checkpoints
checkpoint.keep_interval_updates
checkpoint.keep_interval_updates_pattern
checkpoint.keep_last_epochs
checkpoint.load_checkpoint_on_all_dp_ranks
checkpoint.maximize_best_checkpoint_metric
checkpoint.model_parallel_size
checkpoint.no_epoch_checkpoints
checkpoint.no_last_checkpoints
checkpoint.no_save
checkpoint.no_save_optimizer_state
checkpoint.optimizer_overrides
checkpoint.patience
checkpoint.reset_dataloader
checkpoint.reset_lr_scheduler
checkpoint.reset_meters
checkpoint.reset_optimizer
checkpoint.restore_file
checkpoint.save_dir
checkpoint.save_interval
checkpoint.save_interval_updates
checkpoint.write_checkpoints_asynchronously
common.all_gather_list_size
common.amp
common.amp_batch_retries
common.amp_init_scale
common.azureml_logging
common.bf16
common.cpu
common.empty_cache_freq
common.fp16
common.fp16_init_scale
common.fp16_no_flatten_grads
common.fp16_scale_tolerance
Crashed
-
yhn112
22h 19m 32s
-
false
1
0.875
1
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.XB9bRo4rpN
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
2h 54m 38s
-
false
1
0.875
1
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.vP64v7Pb9p
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
7h 33m 30s
-
false
1
0.875
1
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.STovi4hx27
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
3d 1h 1m 53s
-
false
1
0.875
1
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.PQmJkomQ7a
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
24s
-
false
1
0.875
8
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.w3UdDiLZom
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
4d 23h 17m 9s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.Jbrw0BLie9
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
5d 22h 43m 30s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.l3PFNVqdui
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
1d 16h 43m 40s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.egASrfUkcZ
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
7m 34s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.6LA9Vrv9ak
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
3h 30m 4s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.PofORSSh8z
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
1h 16m 15s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.0ZrVDfoSvG
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
20m 42s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.gxPDN8FeV8
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
35m 51s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.roUojDk5Os
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
1m 29s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.ILeNnqTiza
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
48m 59s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.ZnHdKfFg8M
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
19m 41s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.SIUfAl98BH
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
12m 35s
-
false
1
0.875
2
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.AM8iKyFIEE
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
8d 6h 3m 40s
-
false
1
0.875
8
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.ii0X7fFeMg
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
8d 5h 39m 54s
-
false
1
0.875
8
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.PthfAiswGa
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
Crashed
-
yhn112
5d 17h 22m 54s
-
false
1
0.875
8
50
false
500
loss
1
0
2
-1
-1
false
false
1
true
false
false
false
{}
-1
false
false
false
false
checkpoint_last.pt
/tmp/job/tmp.QG3fsA9RK6
1
1000
false
16384
true
2
128
false
false
false
0
false
128
true
0
1-20
of 185