Upup-ashton-wang's workspace
Runs
47
Name
9 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
base_model_name
batch_size
host_model_checkpoint
host_model_post_train_dataset_name
host_model_post_train_type
learning_rate
logging_steps
lora_alpha
lora_dropout
lora_r
lora_target_modules
sae_hookpoint
sae_name
save_steps
seed
num_epochs
if_nondistill
non_distill_base_model_name
nondistill_base_model_name
distill_type
student_model_name
_attn_implementation_autoset
_name_or_path
accelerator_config.even_batches
accelerator_config.non_blocking
accelerator_config.split_batches
accelerator_config.use_seedable_sampler
adafactor
adam_beta1
adam_beta2
adam_epsilon
add_cross_attention
architectures
attention_dropout
auto_find_batch_size
average_tokens_across_devices
batch_eval_metrics
bf16
bf16_full_eval
bos_token_id
chars_per_token
chunk_size_feed_forward
dataloader_drop_last
dataloader_num_workers
Finished
-
upup-ashton-wang
15d 21h 36m 17s
-
-
-
-
-
-
0.00001
1
-
-
-
-
-
-
500
42
-
-
-
-
-
-
true
["/home/omer/shangshang/project/reasoning/reasoning-sae/ckpts/models/DeepSeek-R1-Distill-Qwen-1.5B/base","/project/neiswang_1391/shangsha/reasoning/reasoning-sae/ckpts/models/DeepSeek-R1-Distill-Qwen-1.5B/base"]
true
false
false
true
false
0.9
0.999
1.0000e-8
false
Qwen2ForCausalLM
0
false
false
false
true
false
151643
<CHARS_PER_TOKEN>
0
false
0
Finished
-
upup-ashton-wang
3mo 7d 20h 42m 38s
-
DeepSeek-R1-Distill-Qwen-1.5B
1
["checkpoint-0","checkpoint-1000","checkpoint-2000"]
["curated_deepscaler","curated_still"]
grpo
0.000001
1
128
0.05
32
["down_proj","gate_proj","k_proj","o_proj","q_proj","up_proj","v_proj"]
["layers.12.mlp","model.layers.12"]
sae-DeepSeek-R1-Distill-Qwen-1.5B-65k
500
42
2.11111
-
-
-
sft_r1_distill
DeepSeek-R1-Distill-Qwen-1.5B
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
-
upup-ashton-wang
12h 17m 17s
-
DeepSeek-R1-Distill-Qwen-1.5B
1
["checkpoint-1","checkpoint-10","checkpoint-100","checkpoint-50"]
curated_still
grpo
0.000001
1
128
0.05
32
["down_proj","gate_proj","k_proj","o_proj","q_proj","up_proj","v_proj"]
model.layers.12
sae-DeepSeek-R1-Distill-Qwen-1.5B-65k
500
42
2
-
-
-
sft_r1_distill
DeepSeek-R1-Distill-Qwen-1.5B
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
-
upup-ashton-wang
10d 15h 43m 6s
-
DeepSeek-R1-Distill-Qwen-1.5B
1
checkpoint-2000
curated_still
grpo
0.000001
1
128
0.05
32
["down_proj","gate_proj","k_proj","o_proj","q_proj","up_proj","v_proj"]
model.layers.12
sae-DeepSeek-R1-Distill-Qwen-1.5B-65k
500
42
2
-
-
-
sft_r1_distill
DeepSeek-R1-Distill-Qwen-1.5B
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
-
upup-ashton-wang
7h 38m 7s
-
DeepSeek-R1-Distill-Qwen-1.5B
1
["checkpoint-1","checkpoint-10","checkpoint-100","checkpoint-3000","checkpoint-50","checkpoint-500"]
curated_still
grpo
0.000001
1
128
0.05
32
["down_proj","gate_proj","k_proj","o_proj","q_proj","up_proj","v_proj"]
model.layers.12
sae-DeepSeek-R1-Distill-Qwen-1.5B-65k
500
42
2
-
-
-
sft_r1_distill
DeepSeek-R1-Distill-Qwen-1.5B
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
-
upup-ashton-wang
1d 12h 20m 2s
-
DeepSeek-R1-Distill-Qwen-1.5B
1
checkpoint-2000
curated_still
grpo
0.000001
1
128
0.05
32
["down_proj","gate_proj","k_proj","o_proj","q_proj","up_proj","v_proj"]
["model.layers.16","model.layers.17","model.layers.18","model.layers.20","model.layers.21","model.layers.22","model.layers.23","model.layers.24","model.layers.25","model.layers.26"]
sae-DeepSeek-R1-Distill-Qwen-1.5B-65k
500
42
2.08696
false
-
Qwen2.5-Math-1.5B
sft_r1_distill
DeepSeek-R1-Distill-Qwen-1.5B
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
1-6
of 6