Aran's workspace
Runs
514
Name
20 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
act_fn
anneal_steps
bf16_optimizer
bucket
ckpt_every
comment
cores_per_replica
d_head
d_model
dconv
early_collect
end_lr
eos_token_id
eval_harness_tasks
ffn_lr_mult
gradient_accumulation_steps
head_mode
is_flan_dataset
k_lr_mult
keep_every
layers
lr
mask_token_id
mlm_probability
model_dir
n_heads
n_vocab
name
norm
num_experts
optimizer
pe
pe_rotary_dims
pe_rotary_polar
per_replica_batch
seq
total_steps
tpu_name
tpu_size
tpus_per_replica
train_set
val_batch_multiplier
val_batches
val_every
Finished
-
aran
6h 32m 26s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-final5
8
50400
baseline5
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
6h 43m 54s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-dense2
8
50400
moe-dense2
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
128
-
pile.train.index
-
100
2000
Failed
-
aran
39s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-final5
8
50400
baseline5
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
128
-
pile.train.index
-
100
2000
Failed
-
aran
4m 29s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-dense2
8
50400
moe-dense2
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
128
-
pile.train.index
-
100
2000
Failed
-
aran
20h 57m 29s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-final5
8
50400
baseline5
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
128
-
pile.train.index
-
100
2000
Failed
-
aran
23h 58m 20s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-dense2
8
50400
moe-dense2
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
11h 24m 17s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-dense
8
50400
moe-dense
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
2m 51s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-dense
8
50400
moe-dense
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
128
-
pile.train.index
-
100
2000
Failed
-
aran
15h 58m 52s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-final4
8
50400
baseline4
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
3m 5s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-final3
8
50400
baseline3
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
2m 10s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-final3
8
50400
baseline3
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
30s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-final2
8
50400
baseline2
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
1m 27s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-final
8
50400
baseline
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
2m 22s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
moe-final
8
50400
GPT3_6B_pile_rotary
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
2048
300000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
27s
-
-
1000000
-
neo-models
10000
1
-
768
-
-
0.00006
-
[]
-
1
-
-
-
10000
12
0.0006
-
-
mtj-glm/baseline_small
6
50400
baseline_small
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
1
256
1000000
-
32
-
pile.train.index
-
100
2000
Killed
-
aran
3h 19m 31s
-
-
1000000
-
neo-models
10000
1
-
768
-
-
0.00006
-
[]
-
1
-
-
-
10000
12
0.0006
-
-
mtj-glm/bert2
6
50400
bert2
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
1
256
1000000
-
32
-
pile.train.index
-
100
2000
Killed
-
aran
27m 56s
-
-
1000000
-
neo-models
10000
1
-
768
-
-
0.00006
-
[]
-
1
-
-
-
10000
12
0.0006
-
-
mtj-glm/bert
6
50400
bert
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
1
256
1000000
-
32
-
pile.train.index
-
100
2000
Failed
-
aran
1d 4m 47s
-
-
1000000
-
neo-models
10000
1
-
768
-
-
0.00006
-
[]
-
1
-
-
-
10000
12
0.0006
-
-
mtj-glm/baseline_small
6
50400
baseline_small
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
1
256
1000000
-
32
-
pile.train.index
-
100
2000
Killed
-
aran
12m 40s
-
-
1000000
-
neo-models
10000
1
-
768
-
-
0.00006
-
[]
-
1
-
-
-
10000
12
0.0006
-
-
mtj-glm/bert
6
50400
bert
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
1
256
1000000
-
32
-
pile.train.index
-
100
2000
Killed
-
aran
4m 6s
-
-
1000000
-
neo-models
10000
1
-
768
-
-
0.00006
-
[]
-
2
-
-
-
10000
12
0.0006
-
-
mtj-glm/bert
6
50400
bert
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
1
256
1000000
-
32
-
pile.train.index
-
100
2000
Killed
-
aran
3h 26m 3s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
2
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm18
8
50400
glm18
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
300000
-
128
-
pile.train.index
-
100
2000
Failed
-
aran
11h 35m 7s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
2
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm18
8
50400
glm18
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
300000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
1h 2m 30s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm18
8
50400
glm18
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
300000
-
256
-
pile.train.index
-
100
2000
Failed
-
aran
1d 5m 18s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline
8
50400
baseline
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
300000
-
256
-
pile.train.index
-
100
2000
Failed
-
aran
1d 4m 47s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm17
8
50400
glm17
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
300000
-
256
-
pile.train.index
-
100
2000
Killed
-
aran
2h 53m 12s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm16
8
50400
glm16
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
300000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
19m 16s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm16
8
50400
glm16
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
300000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
1m 35s
-
-
300000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm16
8
50400
glm16
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
1
256
300000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
6h 4m 27s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm15
8
50400
glm15
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
100000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
43m 39s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm15
8
50400
glm15
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
100000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
1d 7h 55m 30s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
4
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm14-8cores
8
50400
glm14-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
22m 10s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
4
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm14-8cores
8
50400
glm14-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
1h 46m 32s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
4
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm13-8cores
8
50400
glm13-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
9h 18m 6s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
4
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm12-8cores
8
50400
glm12-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
2
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
9h 33m 47s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
8
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm11-8cores
8
50400
glm11-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
2h 18m 3s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
8
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm10-8cores
8
50400
glm10-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
2m 6s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm10-8cores
8
50400
glm10-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
4m 10s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm10-8cores
8
50400
glm10-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
32
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
14m 22s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm10-8cores
8
50400
glm10-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
11m 2s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm10-8cores
8
50400
glm10-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
8m 20s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm9-8cores
8
50400
glm9-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
39m 8s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm8-8cores
8
50400
glm8-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
4m 13s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm7-8cores
8
50400
glm7-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
2h 31m 31s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm7-8cores
8
50400
glm7-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
32m 37s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm7-8cores
8
50400
glm7-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
none
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
2h 39m 51s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm6-8cores
8
50400
glm6-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Crashed
-
aran
52m 7s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm6-8cores
8
50400
glm6-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
1h 18m 25s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm5-8cores
8
50624
glm5-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
1h 13m 6s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm4-8cores
8
50624
glm4-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
5m 34s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm3-8cores
8
50624
glm3-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
1h 7m 7s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm-8cores
8
50624
glm-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
1h 33m 34s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/glm2-8cores
8
50624
glm2-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
4m 26s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline2-8cores
8
50624
baseline2-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Finished
-
aran
4s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/posemb-8cores
8
50624
posemb-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
9s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/posemb-8cores
8
50624
posemb-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
27s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/posemb-8cores
8
50624
posemb-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
19s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/posemb-8cores
8
50624
posemb-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
23s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/posemb-8cores
8
50624
posemb-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
15s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/posemb-8cores
8
50624
posemb-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Finished
-
aran
11s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/posemb-8cores
8
50624
posemb-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Finished
-
aran
43m 5s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/posemb-8cores
8
50624
posemb-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
48m 28s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline-8cores
8
50624
baseline-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
35s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline-8cores
8
50624
baseline-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
13m 47s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline-8cores
8
50624
baseline-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
16s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline-8cores
8
50624
baseline-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
16s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline-8cores
8
50624
baseline-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
12m 21s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline-8cores
8
50624
baseline-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
20s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline-8cores
8
50624
baseline-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Finished
-
aran
7m 7s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline-8cores
8
50624
baseline-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
2048
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
19m 37s
-
-
100000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtj-glm/baseline-8cores
8
50624
baseline-8cores
layernorm
-
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
64
-
16
256
100000
-
8
-
pile.train.index
-
100
2000
Failed
-
bmk
1h 42m 4s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-h-4
16
50400
scaling-h-4
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
bmk
7m 46s
-
-
200000
-
neo-models
10000
1
-
256
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-h-4
4
50400
scaling-h-4
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
aran
1m 13s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-ffn_not_branched
8
50400
scaling-ffn_not_branched
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
8
-
pile.train.index
-
100
2000
Failed
-
bmk
2h 25m 33s
-
-
200000
-
neo-models
10000
1
-
256
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-multi-conv
4
50400
scaling-multi-conv
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
bmk
1h 13m 39s
-
-
200000
-
neo-models
10000
1
-
256
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-multi-conv
4
50400
scaling-multi-conv
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
aran
1m 29s
-
-
200000
-
neo-models
10000
1
-
256
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-ffn_not_branched
16
50400
scaling-ffn_not_branched
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
1m 35s
-
-
200000
-
neo-models
10000
1
-
256
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-ffn_not_branched
16
50400
scaling-ffn_not_branched
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
8
-
pile.train.index
-
100
2000
Finished
-
bmk
2h 14m 48s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-multi-conv
16
50400
scaling-multi-conv
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Killed
-
aran
2m 53s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-mc
16
50400
scaling-mc
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
8
-
pile.train.index
-
100
2000
Finished
-
bmk
2h 37m 3s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-mc
16
50400
scaling-mc
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
aran
4m 13s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-mc
16
50400
scaling-mc
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
8
-
pile.train.index
-
100
2000
Finished
-
aran
3h 16m 25s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-mc
16
50400
scaling-mc
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
8
-
pile.train.index
-
100
2000
Finished
-
bmk
2h 10m 24s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
-
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-soft8
16
50400
scaling-soft8
layernorm
8
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Failed
-
bmk
53m 13s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
fff
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-soft8
16
50400
scaling-soft8
layernorm
8
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Failed
-
bmk
1h 35m 4s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
[]
-
1
fff
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-soft8
16
50400
scaling-soft8
layernorm
8
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
aran
8s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
[]
-
1
fff
-
-
20000
8
0.0006
-
-
mtf-scaling-models/scaling-softmax-test
16
50400
scaling-softmax-test
layernorm
8
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
4
2048
100000
-
8
-
pile.train.index
-
100
2000
Failed
-
bmk
6h 36m 14s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
["lambada","piqa","hellaswag","winogrande","mathqa","pubmedqa"]
-
1
ppp
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-ppp
16
50400
scaling-ppp
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
bmk
8h 38m 25s
-
-
200000
-
neo-models
10000
1
-
1024
-
-
0.00006
-
["lambada","piqa","hellaswag","winogrande","mathqa","pubmedqa"]
-
1
fff
-
-
10000
8
0.0006
-
-
mtf-scaling-models/scaling-fff
16
50400
scaling-fff
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
bmk
5h 5m 7s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
["lambada","piqa","hellaswag","winogrande","mathqa","pubmedqa"]
-
1
ppp
-
-
20000
8
0.0006
-
-
mtf-scaling-models/scaling-ppp
16
50400
scaling-ppp
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
bmk
12h 15m 13s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
["lambada","piqa","hellaswag","winogrande","mathqa","pubmedqa"]
-
1
ppp
-
-
20000
8
0.0006
-
-
mtf-scaling-models/scaling-ppp
16
50400
scaling-ppp
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Failed
-
bmk
5m 31s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
["lambada","piqa","hellaswag","winogrande","mathqa","pubmedqa"]
-
1
ppp
-
-
20000
8
0.0006
-
-
mtf-scaling-models/scaling-ppp
16
50400
scaling-ppp
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
bmk
12h 49m 11s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
["lambada","piqa","hellaswag","winogrande","mathqa","pubmedqa"]
-
1
fff
-
-
20000
8
0.0006
-
-
mtf-scaling-models/scaling-fff
16
50400
scaling-fff
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
2
2048
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
aran
2m 51s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
["lambada","piqa","hellaswag","winogrande","mathqa","pubmedqa"]
-
1
fpp
-
-
20000
8
0.0006
-
-
mtf-scaling-models/ppf
16
50400
ppf
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
4
2048
100000
-
8
-
pile.train.index
-
100
2000
Killed
-
aran
46m 35s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
[]
-
1
ppp
-
-
20000
8
0.0006
-
-
mtf-scaling-models/ppp
16
50400
ppp
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
4
2048
100000
-
8
-
pile.train.index
-
100
2000
Finished
-
bmk
2m 40s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
[]
-
1
ppp
-
-
20000
8
0.0006
-
-
mtf-scaling-models/ppp
16
50400
ppp
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
4
2048
100000
-
128
-
pile.train.index
-
100
2000
Killed
-
bmk
8m 6s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
[]
-
1
fff
-
-
20000
8
0.0006
-
-
mtf-scaling-models/ppp
16
50400
ppp
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
4
1024
100000
-
128
-
pile.train.index
-
100
2000
Finished
-
bmk
6m 33s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
["lambada","piqa","hellaswag","winogrande","mathqa","pubmedqa"]
-
1
fff
-
-
20000
8
0.0006
-
-
mtf-scaling-models/ppp
16
50400
ppp
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
4
1024
100000
-
128
-
pile.train.index
-
100
2000
Failed
-
bmk
7h 57m 24s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
[]
-
1
baseline
-
-
20000
8
0.0006
-
-
mtf-scaling-models/lr-4x-ffn
16
50400
lr-4x-ffn
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
4
1024
100000
-
128
-
pile.train.index
-
100
2000
Failed
-
bmk
8h 20m 53s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
[]
-
1
baseline
-
-
20000
8
0.0006
-
-
mtf-scaling-models/lr-4x-attn
16
50400
lr-4x-attn
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
4
1024
100000
-
128
-
pile.train.index
-
100
2000
Failed
-
bmk
1h 22m 54s
-
-
200000
-
neo-models
20000
1
-
1024
-
-
0.00006
-
[]
-
1
baseline
-
-
20000
8
0.0006
-
-
mtf-scaling-models/lr-4x-attn
16
50400
lr-4x-attn
layernorm
1
["optax._src.combine.chain.<locals>.init_fn","optax._src.combine.chain.<locals>.update_fn"]
rotary
32
-
4
1024
100000
-
128
-
pile.train.index
-
100
2000
1-100
of 514