Awni00's workspace
Runs
48
Name
48 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
activation
always_save_checkpoint
batch_size
beta1
beta2
bias
compile
d_model
datetime_now
decay_lr
device
dropout_rate
dtype
eval_interval
eval_iters
eval_only
grad_clip
gradient_accumulation_steps
init_from
learning_rate
log_interval
max_iters
max_seq_len
model_name
n_layers
n_params
n_params_wo_embedding
norm_first
out_dir
pos_enc_type
rca
rca_type
sa
sym_attn_n_symbols
symbol_type
symmetric_rels
trainable_symbols
use_cosine_sched
vocab_size
vocab_source
wandb_log
wandb_project
wandb_run_name
Finished
awni00
21h 27m 30s
-
swiglu
false
64
0.9
0.95
false
false
128
2024_05_18_17_24_55
true
cuda
0.1
bfloat16
2000
100
false
1
4
scratch
0.001
1
100000
512
sa=4; rca=4; d=128; L=6; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE
6
10005120
1781120
true
out/sa=4; rca=4; d=128; L=6; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_18_17_24_55
RoPE
4
disentangled_v2
4
512
sym_attn
true
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=4; rca=4; d=128; L=6; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_18_17_24_55
Finished
awni00
21h 32m 3s
-
swiglu
false
64
0.9
0.95
false
false
128
2024_05_18_17_23_56
true
cuda
0.1
bfloat16
2000
100
false
1
4
scratch
0.001
1
100000
512
sa=4; rca=4; d=128; L=6; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
6
10103424
1879424
true
out/sa=4; rca=4; d=128; L=6; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_18_17_23_56
RoPE
4
disentangled_v2
4
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=4; rca=4; d=128; L=6; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_18_17_23_56
Finished
awni00
5h 16m 19s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_15_12_34_33
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=4; rca=4; d=64; L=6; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
6
4583616
455616
true
out/sa=4; rca=4; d=64; L=6; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_12_34_33
RoPE
4
standard
4
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=4; rca=4; d=64; L=6; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_12_34_33
Finished
awni00
4h 28m
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_15_10_06_22
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=64; L=6; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
6
4586688
458688
true
out/sa=6; rca=2; d=64; L=6; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_10_06_22
RoPE
2
standard
6
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=64; L=6; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_10_06_22
Finished
awni00
4h 43m 52s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_15_09_12_24
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=4; rca=4; d=64; L=5; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
5
4519296
391296
true
out/sa=4; rca=4; d=64; L=5; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_09_12_24
RoPE
4
standard
4
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=4; rca=4; d=64; L=5; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_09_12_24
Finished
awni00
4h 3m 21s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_15_08_30_59
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=64; L=5; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
5
4521856
393856
true
out/sa=6; rca=2; d=64; L=5; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_08_30_59
RoPE
2
standard
6
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=64; L=5; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_08_30_59
Finished
awni00
4h 12m 30s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_15_04_59_40
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=4; rca=4; d=64; L=4; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
4
4454976
326976
true
out/sa=4; rca=4; d=64; L=4; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_04_59_40
RoPE
4
standard
4
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=4; rca=4; d=64; L=4; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_04_59_40
Finished
awni00
3h 39m 39s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_15_04_51_08
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=64; L=4; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
4
4457024
329024
true
out/sa=6; rca=2; d=64; L=4; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_04_51_08
RoPE
2
standard
6
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=64; L=4; rca_type=standard; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_04_51_08
Finished
awni00
14h 9m 49s
-
swiglu
false
128
0.9
0.95
false
false
128
2024_05_15_04_03_40
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=128; L=6; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
6
10114560
1890560
true
out/sa=6; rca=2; d=128; L=6; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_04_03_40
RoPE
2
disentangled_v2
6
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=128; L=6; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_04_03_40
Finished
awni00
11h 56m 57s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_15_03_22_22
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=64; L=6; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
6
4636032
508032
true
out/sa=6; rca=2; d=64; L=6; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_03_22_22
RoPE
2
disentangled_v2
6
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=64; L=6; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_15_03_22_22
Finished
awni00
15h 24m 39s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_14_22_23_33
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=4; rca=4; d=64; L=5; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE
5
4540416
412416
true
out/sa=4; rca=4; d=64; L=5; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_22_23_33
RoPE
4
disentangled_v2
4
512
sym_attn
true
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=4; rca=4; d=64; L=5; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_22_23_33
Finished
awni00
15h 25m 9s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_14_18_40_49
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=4; rca=4; d=64; L=5; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
5
4560896
432896
true
out/sa=4; rca=4; d=64; L=5; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_18_40_49
RoPE
4
disentangled_v2
4
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=4; rca=4; d=64; L=5; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_18_40_49
Finished
awni00
8h 41m 7s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_14_18_40_49
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
4
4489920
361920
true
out/sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_18_40_49
RoPE
2
disentangled_v2
6
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_18_40_49
Finished
awni00
10h 10m 12s
-
swiglu
false
128
0.9
0.95
false
false
128
2024_05_14_18_40_39
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=128; L=4; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
4
9533568
1309568
true
out/sa=6; rca=2; d=128; L=4; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_18_40_39
RoPE
2
disentangled_v2
6
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=128; L=4; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_18_40_39
Finished
awni00
9h 21m 38s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_14_18_40_39
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=False; symbol_type=pos_relative; pos_enc_type=RoPE
4
4485824
357824
true
out/sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=False; symbol_type=pos_relative; pos_enc_type=RoPE__2024_05_14_18_40_39
RoPE
2
disentangled_v2
6
512
pos_relative
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=False; symbol_type=pos_relative; pos_enc_type=RoPE__2024_05_14_18_40_39
Finished
awni00
10h 18m 45s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_14_18_40_39
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=64; L=5; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE
5
4562976
434976
true
out/sa=6; rca=2; d=64; L=5; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_18_40_39
RoPE
2
disentangled_v2
6
512
sym_attn
false
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=64; L=5; rca_type=disentangled_v2; sym_rel=False; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_18_40_39
Finished
awni00
14h 12m 38s
-
swiglu
false
128
0.9
0.95
false
false
128
2024_05_14_08_10_40
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=128; L=6; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE
6
10016256
1792256
true
out/sa=6; rca=2; d=128; L=6; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_08_10_40
RoPE
2
disentangled_v2
6
512
sym_attn
true
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=128; L=6; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_14_08_10_40
Finished
awni00
3h 37m 3s
-
swiglu
false
128
0.9
0.95
false
false
128
2024_05_14_08_10_35
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=8; d=128; L=6; pos_enc_type=RoPE
6
9799936
1575936
true
out/sa=8; d=128; L=6; pos_enc_type=RoPE__2024_05_14_08_10_35
RoPE
0
NA
8
512
NA
true
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=8; d=128; L=6; pos_enc_type=RoPE__2024_05_14_08_10_35
Finished
awni00
8h 39m 32s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_13_19_23_15
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE
4
4473536
345536
true
out/sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_13_19_23_15
RoPE
2
disentangled_v2
6
512
sym_attn
true
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=True; symbol_type=sym_attn; pos_enc_type=RoPE__2024_05_13_19_23_15
Finished
awni00
9h 23m 34s
-
swiglu
false
128
0.9
0.95
false
false
64
2024_05_13_19_23_15
true
cuda
0.1
bfloat16
2000
100
false
1
2
scratch
0.001
1
100000
512
sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=True; symbol_type=pos_relative; pos_enc_type=RoPE
4
4469440
341440
true
out/sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=True; symbol_type=pos_relative; pos_enc_type=RoPE__2024_05_13_19_23_15
RoPE
2
disentangled_v2
6
512
pos_relative
true
true
false
32000
llama2
true
abstract_transformer--tiny_stories-LM
sa=6; rca=2; d=64; L=4; rca_type=disentangled_v2; sym_rel=True; symbol_type=pos_relative; pos_enc_type=RoPE__2024_05_13_19_23_15
1-20
of 48