a0970601776

A0970601776's workspace

Runs

Finished

tw-legal-qa-single-turn,tw-emergency-medicine-bench,tw-law-article-qa,legalsign-contract-template-qa,legalsign-contract-multiround-chat,legalsign-clause-review,tw-legal-nlp,tw-judgment-qa,tw-bar-examination-2020-chat,legalsign-identity

a0970601776

1y ago

1h 16m 47s

/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

true

false

128009

true

false

auto

false

[]

Crashed

a0970601776

1y ago

38m 31s

/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

true

false

128009

true

false

auto

false

[]

Crashed

a0970601776

1y ago

46m 1s

/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

true

false

128009

true

false

auto

false

[]

Finished

a0970601776

1y ago

1h 16m 39s

/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

128009

true

false

auto

false

[]

Crashed

a0970601776

1y ago

Finished

a0970601776

1y ago

1h 14m 6s

/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

128009

true

false

auto

false

[]

Finished

a0970601776

1y ago

37m 16s

/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

128009

true

false

auto

false

[]

Crashed

a0970601776

1y ago

5m 21s

/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

128009

true

false

auto

false

[]

Finished

a0970601776

1y ago

35m 56s

/workspace/LLaMA-Factory/model/Justitia-8B

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

128009

true

false

auto

false

[]

Finished

a0970601776

1y ago

21m 42s

/workspace/LLaMA-Factory/model/Justitia-8B

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

128009

true

false

auto

false

[]

Finished

a0970601776

1y ago

21m 57s

/workspace/LLaMA-Factory/model/Justitia-8B

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

128009

true

false

auto

false

[]

Crashed

a0970601776

1y ago

4m 6s

/workspace/LLaMA-Factory/model/Justitia-8B

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

128009

true

false

auto

false

[]

Crashed

a0970601776

1y ago

4m 21s

/workspace/LLaMA-Factory/model/Justitia-8B

true

false

true

false

0.9

0.999

1.0000e-8

false

["LlamaForCausalLM"]

false

true

false

128000

false

true

1800

[]

/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json

false

true

false

128009

true

false

auto

false

[]

Finished

a0970601776

1y ago

21h 41m 9s

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5000, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [8474, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/combined_dataset.jsonl_text_document']}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}

Finished

a0970601776

1y ago

2h 1m 37s

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [467, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/law-ctx.jsonl_text_document']}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}

Failed

a0970601776

1y ago

3m 21s

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [467, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/law-ctx.jsonl_text_document']}, 'freeze_updates': {'enabled': True, 'modules': {'joint': [10, 500]}}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}

Failed

a0970601776

1y ago

1m 57s

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [467, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/law-ctx.jsonl_text_document']}, 'freeze_updates': {'enabled': True, 'modules': {'decoder': 200}}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}

Failed

a0970601776

1y ago

1m 58s

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [467, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/law-ctx.jsonl_text_document']}, 'freeze_updates': {'enabled': True, 'modules': {'encoder': 200}}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}

Finished

a0970601776

1y ago

2h 1m 8s

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [467, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/law-ctx.jsonl_text_document']}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}

Crashed

a0970601776

1y ago

20h 29m 11s

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 200000, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [466, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/datasets.jsonl_text_document']}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/results/Llama-3-Taiwan-8B-Instruct-DPO/results/checkpoints/megatron_llama.nemo', 'precision': 'bf16-mixed'}

1-20

of 25