Aflah's workspace
Runs
160
Name
8 visualized
seq_length: 1024
seq_length: 1024
1
32
seq_length: 2048
seq_length: 2048
2
128
dataset: FW_Edu
dataset: FW_Edu
2
100
model: pythia
model: pythia
2
68
pos_emb: none
pos_emb: none
1
40
rotary_pct: 1
rotary_pct: 1
4
40
seed: 1234
seed: 1234
3
24
lr: 0.0004
lr: 0.0004
2
16
use_qk_layernorm: true
use_qk_layernorm: true
1
4
use_qk_layernorm: false
use_qk_layernorm: false
2
12
pipe_parallel_size: 0
pipe_parallel_size: 0
2
8
log_grad_norm: false
log_grad_norm: false
4
log_grad_norm: true
log_grad_norm: true
4
pipe_parallel_size: 1
pipe_parallel_size: 1
1
4
log_grad_norm: false
log_grad_norm: false
4
lr: 0.00004
lr: 0.00004
1
4
lr: 0.004
lr: 0.004
1
4
seed: 238
seed: 238
1
8
seed: 953
seed: 953
1
4
seed: 175
seed: 175
1
4
pos_emb: rotary
pos_emb: rotary
7
28
model: llama
model: llama
2
32
dataset: FW
dataset: FW
1
28
1-2
of 2train/learning_rate
train/learning_rate
seq_length: 2048, dataset: FW_Edu, model: pythia, pos_emb: none, rotary_pct: 1, seed: 1234, lr: 0.0004, use_qk_layernorm: false, pipe_parallel_size: 0, log_grad_norm: false
seq_length: 2048, dataset: FW_Edu, model: pythia, pos_emb: none, rotary_pct: 1, seed: 1234, lr: 0.0004, use_qk_layernorm: false, pipe_parallel_size: 0, log_grad_norm: true