Apiche's workspace
Runs
420
Name
3 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
_cpu
_mixed_precision
actor.chunk_size
actor.sampling.method
actor.submit_delay
actor.threads_per_llm
agent._target_
agent.max_prompt_length
also_save_steps
attempts
attn_implementation
auto_device_map
backend
config_name
cuda_empty_cache
debug.mode
debug.place_inference_workers
debug
deepspeed_config
deepspeed_plugins
device
discount_factor
distributed_type
dynamo_plugin
eval_callback._target_
eval_callback.config_name
eval_every_n_versions
finetune.also_save_steps
finetune.attn_implementation
finetune.auto_device_map
finetune.config_name
finetune.cuda_empty_cache
finetune.eval_callback._target_
finetune.eval_callback.config_name
finetune.force_restart
finetune.gradient_accumulation_passes
finetune.gradient_checkpointing
finetune.gradient_clipping_threshold
finetune.input
finetune.interrupt_train_steps
finetune.keep_intermediate_checkpoints
finetune.learning_rate
finetune.load_as_bf16
finetune.log_each_n_steps
Crashed
apiche
2h 21m 54s
-
False
no
-
-
-
-
-
-
-
8
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc24530f50>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-7B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
1m 1s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc246d6b50>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-7B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
4m 43s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc24530cd0>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-7B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
25m 22s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb053d990>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-7B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
4m 20s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb4261f10>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
1h 5m 50s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc246daa90>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
1h 41m 26s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc246e1fd0>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
2h 48m 52s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc246da510>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
21m 29s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc2452fad0>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
Killed
apiche
1h 15m 46s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc2452e750>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
1024
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
18m 17s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc246deb90>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
1024
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
7m 46s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc245cd650>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
1024
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
25m 44s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc24762050>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
1024
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
57m 16s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc246ef2d0>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
1024
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
12m 19s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc2453b190>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
1024
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
17m 24s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc246dc250>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
1024
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
34m 28s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffc246d65d0>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
0
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
1024
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
5m 37s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb053f8d0>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
1024
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
45m 43s
-
False
no
-
-
-
-
-
-
-
1
-
-
nccl
-
-
true
False
deepspeed_stage3_bf16
DeepSpeedPlugin(hf_ds_config=<accelerate.utils.deepspeed.HfDeepSpeedConfig object at 0x7ffcb076d950>, gradient_accumulation_steps=1, gradient_clipping='auto', zero_stage=3, is_train_batch_min=True, offload_optimizer_device='none', offload_param_device='none', offload_optimizer_nvme_path='none', offload_param_nvme_path='none', zero3_init_flag=True, zero3_save_16bit_model=True, transformer_moe_cls_names=None, enable_msamp=False, msamp_opt_level='O1')
cuda:0
-
DistributedType.DEEPSPEED
TorchDynamoPlugin(backend=<DynamoBackend.NO: 'NO'>, mode='default', fullgraph=False, dynamic=None, options=None, disable=False, use_regional_compilation=False)
-
-
78000
-
flash_attention_2
false
Qwen/Qwen2.5-0.5B
true
tapeagents.finetune.eval.dummy_eval_callback
true
1024
true
0.3
training_data
-1
true
0.000001
true
1
Crashed
apiche
33m 29s
-
-
-
-
-
-
-
-
-
-
1
-
-
-
-
-
actor
true
-
deepspeed_stage3_bf16
-
-
-
-
-
-
-
78000
-
flash_attention_2
false
/mnt/llmd/base_models/Mistral-Small-24B-Base-2501/
true
tapeagents.finetune.eval.dummy_eval_callback
true
16
true
0.3
training_data
-1
true
0.000001
true
1
1-20
of 175