rafi-personal

Rdoublea's workspace

Runs

154

•

Crashed

Add notes...

rdoublea

8mo ago

21m 46s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-17B/

true

torchtune.datasets.alpaca_dataset

false

cuda

bf16

false

true

torchtune.modules.loss.CEWithChunkedOutputLoss

torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup

100

1000

torchtune.training.metric_logging.WandBLogger

/tmp/lora-llama4-finetune

torchtune.models.llama4.lora_llama4_17bx16

true

false

["q_proj","v_proj","output_proj"]

torch.optim.AdamW

0.00002

false

/tmp/lora-llama4-finetune

torchtune.training.setup_torch_profiler

true

Killed

Add notes...

rdoublea

8mo ago

20m 12s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-17B/

false

torchtune.datasets.alpaca_dataset

false

cuda

bf16

false

true

torchtune.modules.loss.CEWithChunkedOutputLoss

torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup

100

1000

torchtune.training.metric_logging.WandBLogger

/tmp/lora-llama4-finetune

torchtune.models.llama4.lora_llama4_17bx16

true

false

["q_proj","v_proj","output_proj"]

torch.optim.AdamW

0.00002

false

/tmp/lora-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

1m 29s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-17B/

false

torchtune.datasets.alpaca_dataset

false

cuda

bf16

false

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/lora-llama4-finetune

torchtune.models.llama4.lora_llama4_17bx16

true

false

["q_proj","v_proj","output_proj"]

torch.optim.AdamW

0.00002

false

/tmp/lora-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-17B/

false

torchtune.datasets.alpaca_dataset

false

cuda

bf16

false

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/lora-llama4-finetune

torchtune.models.llama4.lora_llama4_17bx16

true

false

["q_proj","v_proj","output_proj"]

torch.optim.AdamW

0.00002

false

/tmp/lora-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

1m 48s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-17B/

false

torchtune.datasets.alpaca_dataset

false

cuda

bf16

false

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/lora-llama4-finetune

torchtune.models.llama4.lora_llama4_17bx16

true

false

["q_proj","v_proj","output_proj"]

torch.optim.AdamW

0.00002

false

/tmp/lora-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

59s

torchtune.training.FullModelMetaCheckpointer

/home/jessicazhong/pci-wsf/jessicazhong/checkpoints/20m_moe_svt_mp1pp1/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/torchtune/llama4_20Mx8/full

false

torchtune.datasets.alpaca_dataset

false

cuda

bf16

false

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/torchtune/llama4_20Mx8/full/logs

torchtune.models.llama4.llama4_20mx8

torch.optim.AdamW

0.0002

true

/tmp/torchtune/llama4_20Mx8/full

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

torchtune.training.FullModelMetaCheckpointer

/home/jessicazhong/pci-wsf/jessicazhong/checkpoints/20m_moe_svt_mp1pp1/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.multimodal.librispeech_asr_dataset

false

cuda

bf16

false

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_20mx8

torch.optim.AdamW

0.0002

true

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Killed

Add notes...

rdoublea

8mo ago

27s

torchtune.training.FullModelMetaCheckpointer

/home/jessicazhong/pci-wsf/jessicazhong/checkpoints/20m_moe_svt_mp1pp1/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.multimodal.librispeech_asr_dataset

false

cuda

bf16

false

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_20mx8

torch.optim.AdamW

0.0002

true

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

26s

torchtune.training.FullModelMetaCheckpointer

/home/jessicazhong/pci-wsf/jessicazhong/checkpoints/20m_moe_svt_mp1pp1/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.multimodal.librispeech_asr_dataset

false

cuda

bf16

false

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_20mx8

torch.optim.AdamW

0.0002

true

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

1m 29s

torchtune.training.FullModelMetaCheckpointer

/tmp/Llama-4-20M-MOE/epoch_0

["ft-model-00001-of-00001.bin"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.alpaca_dataset

false

cuda

bf16

true

torchtune.modules.loss.CEWithChunkedOutputLoss

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_17bx16

torch.optim.AdamW

0.00002

false

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

1m 34s

torchtune.training.FullModelMetaCheckpointer

/tmp/Llama-4-20M-MOE/epoch_0

["ft-model-00001-of-00001.bin"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.alpaca_dataset

false

cuda

bf16

true

torchtune.modules.loss.CEWithChunkedOutputLoss

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_17bx16

torch.optim.AdamW

0.00002

false

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

4m 55s

torchtune.training.FullModelMetaCheckpointer

/tmp/Llama-4-20M-MOE/epoch_0

["ft-model-00001-of-00001.bin"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.alpaca_dataset

false

cuda

bf16

true

torchtune.modules.loss.CEWithChunkedOutputLoss

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_17bx16

torch.optim.AdamW

0.00002

false

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Crashed

Add notes...

rdoublea

8mo ago

7m 31s

torchtune.training.FullModelMetaCheckpointer

/tmp/Llama-4-20M-MOE/epoch_0

["ft-model-00001-of-00001.bin"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.alpaca_dataset

false

cuda

bf16

true

torchtune.modules.loss.CEWithChunkedOutputLoss

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_17bx16

torch.optim.AdamW

0.00002

false

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

1m 22s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-17B/

false

torchtune.datasets.alpaca_dataset

false

cuda

bf16

false

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/lora-llama4-finetune

torchtune.models.llama4.lora_llama4_17bx16

true

false

["q_proj","v_proj","output_proj"]

torch.optim.AdamW

0.00002

false

/tmp/lora-llama4-finetune

torchtune.training.setup_torch_profiler

true

Crashed

Add notes...

rdoublea

8mo ago

24m 13s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.multimodal.librispeech_asr_dataset

false

cuda

bf16

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_17bx16

torch.optim.AdamW

0.00002

false

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

9m 46s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.multimodal.librispeech_asr_dataset

false

cuda

bf16

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_17bx16

torch.optim.AdamW

0.00002

false

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

1m 26s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.multimodal.librispeech_asr_dataset

false

cuda

bf16

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_17bx16

torch.optim.AdamW

0.00002

false

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

1m 23s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.multimodal.librispeech_asr_dataset

false

cuda

bf16

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_17bx16

torch.optim.AdamW

0.00002

false

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Failed

Add notes...

rdoublea

8mo ago

1m 24s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.multimodal.librispeech_asr_dataset

false

cuda

bf16

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_17bx16

torch.optim.AdamW

0.00002

false

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

Killed

Add notes...

rdoublea

8mo ago

19s

torchtune.training.FullModelMetaCheckpointer

/home/rafiayub/checkpoints/17b_moe_svt_mp1pp1_non_te/

["consolidated_with_vision_and_speech_encoder_weights.00.pth"]

LLAMA4

/tmp/Llama-4-20M-MOE/

false

torchtune.datasets.multimodal.librispeech_asr_dataset

false

cuda

bf16

true

torchtune.modules.loss.CEWithChunkedOutputLoss

1000

torchtune.training.metric_logging.WandBLogger

/tmp/full-llama4-finetune

torchtune.models.llama4.llama4_17bx16

torch.optim.AdamW

0.00002

false

/tmp/full-llama4-finetune

torchtune.training.setup_torch_profiler

true

1-20

of 154