Saforem2's workspace
Runs
25
Name
21 visualized
machine: Perlmutter
machine: Perlmutter
1
2
machine: SunSpot
machine: SunSpot
2
15
machine: Polaris
machine: Polaris
1
2
machine: Aurora
machine: Aurora
2
2
State
Notes
User
Tags
Created
Runtime
Sweep
DIST_INFO.DEVICE
DIST_INFO.DEVICE_ID
DIST_INFO.DISTRIBUTED_BACKEND
DIST_INFO.GPUS_PER_NODE
DIST_INFO.HOSTFILE
DIST_INFO.HOSTNAME
DIST_INFO.HOSTS
DIST_INFO.LOCAL_RANK
DIST_INFO.MACHINE
DIST_INFO.NGPUS
DIST_INFO.NODE_ID
DIST_INFO.NUM_NODES
DIST_INFO.RANK
DIST_INFO.SCHEDULER
DIST_INFO.WORLD_SIZE_IN_USE
DIST_INFO.WORLD_SIZE_TOTAL
args.DDP_impl
args.accumulate_allreduce_grads_in_fp32
args.adam_beta1
args.adam_beta2
args.adam_eps
args.add_bias_linear
args.add_position_embedding
args.adlr_autoresume
args.adlr_autoresume_interval
args.apply_layernorm_1p
args.apply_query_key_layer_scaling
args.apply_residual_connection_post_layernorm
args.async_tensor_model_parallel_allreduce
args.attention_dropout
args.attention_softmax_in_fp32
args.barrier_with_L1_time
args.bert_binary_head
args.bert_embedder_type
args.bf16
args.bias_dropout_fusion
args.bias_gelu_fusion
args.biencoder_projection_dim
args.biencoder_shared_query_context_model
args.checkpoint_activations
args.checkpoint_in_cpu
args.checkpoint_num_layers
args.classes_fraction
args.clip_grad
Failed
-
saforem2
57m 31s
-
cuda
cuda:0
nccl
4
/global/homes/f/foremans/.slurm-nodefile
["nid008221","nid008493"]
["['nid008221', 'nid008224']","['nid008493', 'nid008513']"]
0
Perlmutter
8
0
2
0
SLURM
8
8
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
-
saforem2
19d 22h 5m 46s
-
["cuda","xpu"]
["cuda:0","xpu:0"]
["ccl","nccl"]
6.13333
["/global/homes/f/foremans/.slurm-nodefile","/var/spool/pbs/aux/8269282.amn-0001"]
["x1404c0s6b0n0h0.chn","x1404c2s2b0n0h0.chn","x1404c2s2b0n1h0.chn","x1404c3s1b0n0h0.chn","x1921c0s7b0n0.hostmgmt2000.cm.americas.sgi.com"]
["['nid008216', 'nid008380']","['nid008264', 'nid008265']","['nid008265', 'nid008268']","['nid008292', 'nid008293']","['x1921c0s7b0n0', 'x1922c6s5b0n0']"]
0
SunSpot
12.26667
0
2
0
PBS
12.26667
12.26667
local
true
0.9
0.999
1.0000e-8
false
false
false
1000
false
false
false
false
0.1
false
true
true
megatron
true
false
false
0
false
true
false
1
1
1
Crashed
-
saforem2
29m 1s
-
cuda
cuda:0
nccl
4
["/var/spool/pbs/aux/1704545.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov","/var/spool/pbs/aux/1704827.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov"]
["x3005c0s37b1n0.hsn.cm.polaris.alcf.anl.gov","x3006c0s19b1n0.hsn.cm.polaris.alcf.anl.gov"]
["['x3005c0s37b1n0', 'x3005c0s7b0n0']","['x3006c0s19b1n0', 'x3006c0s1b0n0']"]
0
Polaris
8
0
2
0
PBS
8
8
local
true
0.9
0.999
1.0000e-8
false
false
false
1000
false
false
false
false
0.1
false
true
true
megatron
false
false
false
0
false
true
false
1
1
1
Killed
-
saforem2
1h 34m 32s
-
xpu
xpu:0
ccl
12
["/var/spool/pbs/aux/618963.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov","/var/spool/pbs/aux/619040.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov"]
["x4309c5s3b0n0.hostmgmt2309.cm.aurora.alcf.anl.gov","x4417c5s5b0n0.hostmgmt2417.cm.aurora.alcf.anl.gov"]
["['x4309c5s3b0n0', 'x4309c5s4b0n0', 'x4309c5s6b0n0', 'x4309c5s7b0n0', 'x4309c6s2b0n0', 'x4309c6s3b0n0', 'x4309c6s6b0n0', 'x4309c6s7b0n0', 'x4309c7s1b0n0', 'x4309c7s5b0n0', 'x4310c0s5b0n0', 'x4310c1s0b0n0', 'x4310c1s1b0n0', 'x4310c1s2b0n0', 'x4310c1s5b0n0', 'x4310c2s0b0n0', 'x4310c2s2b0n0', 'x4310c2s3b0n0', 'x4310c2s4b0n0', 'x4310c2s5b0n0', 'x4310c2s7b0n0', 'x4310c3s0b0n0', 'x4310c3s1b0n0', 'x4310c3s2b0n0', 'x4310c3s5b0n0', 'x4310c3s7b0n0', 'x4310c4s1b0n0', 'x4310c4s4b0n0', 'x4310c4s5b0n0', 'x4310c4s6b0n0', 'x4310c5s0b0n0', 'x4310c5s3b0n0']","['x4417c5s5b0n0', 'x4417c5s6b0n0', 'x4417c5s7b0n0', 'x4417c6s0b0n0']"]
0
Aurora
216
0
18
0
PBS
216
216
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
1-4
of 4