Apiche's workspace
Runs
362
Name
3 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
_cpu
_mixed_precision
actor.chunk_size
actor.sampling.method
actor.submit_delay
actor.threads_per_llm
agent._target_
agent.max_prompt_length
also_save_steps
attempts
attn_implementation
auto_device_map
backend
config_name
cuda_empty_cache
debug.mode
debug.place_inference_workers
debug
deepspeed_config
deepspeed_plugins
device
discount_factor
distributed_type
dynamo_plugin
eval_callback._target_
eval_callback.config_name
eval_every_n_versions
finetune.also_save_steps
finetune.attn_implementation
finetune.auto_device_map
finetune.config_name
finetune.cuda_empty_cache
finetune.eval_callback._target_
finetune.eval_callback.config_name
finetune.force_restart
finetune.gradient_accumulation_passes
finetune.gradient_checkpointing
finetune.gradient_clipping_threshold
finetune.input
finetune.interrupt_train_steps
finetune.keep_intermediate_checkpoints
finetune.learning_rate
finetune.load_as_bf16
finetune.log_each_n_steps
Crashed
-
apiche
3h 39m 11s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb2b4a050>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
3m 48s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb2c18990>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
1m 35s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb2c17d50>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
5m 26s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb2c29350>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
7m 32s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb3c0d890>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
3m 46s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb3c0d4d0>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
2m 10s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb2b4b010>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
64
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
3m 17s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb2dd14d0>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
512
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
2m 1s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb2b49450>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
512
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
8m 45s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb2dd5610>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
512
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
1h 8m 53s
-
-
-
-
-
-
-
-
-
-
1
-
-
-
-
-
actor
true
-
deepspeed_stage3_bf16
-
-
-
-
-
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
512
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
27m 7s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
finetune+preprocessor+sft
false
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb0758310>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen3-4B-Instruct-2507
true
tapeagents.finetune.eval.dummy_eval_callback
false
4
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
1m 31s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
finetune+preprocessor+sft
false
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb083aa50>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen3-4B-Instruct-2507
true
tapeagents.finetune.eval.dummy_eval_callback
false
4
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
2m 4s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
finetune+preprocessor+sft
false
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb0747910>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen3-4B-Instruct-2507
true
tapeagents.finetune.eval.dummy_eval_callback
false
4
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
22m 31s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
finetune+preprocessor+sft
false
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb08e31d0>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen3-4B-Instruct-2507
true
tapeagents.finetune.eval.dummy_eval_callback
false
4
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
46s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
finetune+preprocessor+sft
false
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb08e0e90>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen3-4B-Instruct-2507
true
tapeagents.finetune.eval.dummy_eval_callback
false
4
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
4m 46s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
finetune+preprocessor+sft
false
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb0829210>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen3-4B-Instruct-2507
true
tapeagents.finetune.eval.dummy_eval_callback
false
4
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
34s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
finetune+preprocessor+sft
false
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb085aa90>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen3-4B-Instruct-2507
true
tapeagents.finetune.eval.dummy_eval_callback
false
4
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
4m 55s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
finetune+preprocessor+sft
false
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb063eed0>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen3-4B-Instruct-2507
true
tapeagents.finetune.eval.dummy_eval_callback
false
4
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
-
apiche
50s
-
False
no
-
-
-
-
-
-
-
8
-
-
nccl
-
-
finetune+preprocessor+sft
false
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb063eb10>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen3-4B-Instruct-2507
true
tapeagents.finetune.eval.dummy_eval_callback
false
4
true
0.3
training_data
-1
true
0.000001
true
1
1-20
of 155