A0970601776's workspace
Runs
25
Name
4 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
_name_or_path
accelerator_config.even_batches
accelerator_config.non_blocking
accelerator_config.split_batches
accelerator_config.use_seedable_sampler
adafactor
adam_beta1
adam_beta2
adam_epsilon
add_cross_attention
architectures
attention_bias
attention_dropout
auto_find_batch_size
batch_eval_metrics
bf16
bf16_full_eval
bos_token_id
cfg
chunk_size_feed_forward
dataloader_drop_last
dataloader_num_workers
dataloader_persistent_workers
dataloader_pin_memory
ddp_timeout
debug
deepspeed
disable_tqdm
diversity_penalty
do_eval
do_predict
do_sample
do_train
early_stopping
encoder_no_repeat_ngram_size
eos_token_id
eval_delay
eval_do_concat_batches
eval_strategy
fp16
fp16_backend
fp16_full_eval
fp16_opt_level
fsdp
Finished
tw-legal-qa-single-turn,tw-emergency-medicine-bench,tw-law-article-qa,legalsign-contract-template-qa,legalsign-contract-multiround-chat,legalsign-clause-review,tw-legal-nlp,tw-judgment-qa,tw-bar-examination-2020-chat,legalsign-identity
a0970601776
1h 16m 47s
-
/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
true
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Crashed
tw-legal-qa-single-turn,tw-emergency-medicine-bench,tw-law-article-qa,legalsign-contract-template-qa,legalsign-contract-multiround-chat,legalsign-clause-review,tw-legal-nlp,tw-judgment-qa,tw-bar-examination-2020-chat,legalsign-identity
a0970601776
38m 31s
-
/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
true
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Crashed
tw-legal-qa-single-turn,tw-emergency-medicine-bench,tw-law-article-qa,legalsign-contract-template-qa,legalsign-contract-multiround-chat,legalsign-clause-review,tw-legal-nlp,tw-judgment-qa,tw-bar-examination-2020-chat,legalsign-identity
a0970601776
46m 1s
-
/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
true
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Finished
-
a0970601776
1h 16m 39s
-
/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
false
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Crashed
-
a0970601776
4s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
-
a0970601776
1h 14m 6s
-
/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
false
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Finished
-
a0970601776
37m 16s
-
/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
false
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Crashed
-
a0970601776
5m 21s
-
/workspace/LLaMA-Factory/model/Llama-3-Taiwan-8B-Instruct
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
false
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Finished
-
a0970601776
35m 56s
-
/workspace/LLaMA-Factory/model/Justitia-8B
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
false
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Finished
-
a0970601776
21m 42s
-
/workspace/LLaMA-Factory/model/Justitia-8B
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
false
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Finished
-
a0970601776
21m 57s
-
/workspace/LLaMA-Factory/model/Justitia-8B
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
false
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Crashed
-
a0970601776
4m 6s
-
/workspace/LLaMA-Factory/model/Justitia-8B
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
false
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Crashed
-
a0970601776
4m 21s
-
/workspace/LLaMA-Factory/model/Justitia-8B
true
false
false
true
false
0.9
0.999
1.0000e-8
false
["LlamaForCausalLM"]
false
0
false
false
true
false
128000
-
0
false
0
false
true
1800
[]
/workspace/LLaMA-Factory/examples/deepspeed/ds_z1_config.json
false
0
false
false
false
true
false
0
128009
0
true
no
false
auto
false
O1
[]
Finished
-
a0970601776
21h 41m 9s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5000, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [8474, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/combined_dataset.jsonl_text_document']}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
-
a0970601776
2h 1m 37s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [467, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/law-ctx.jsonl_text_document']}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Failed
-
a0970601776
3m 21s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [467, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/law-ctx.jsonl_text_document']}, 'freeze_updates': {'enabled': True, 'modules': {'joint': [10, 500]}}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Failed
-
a0970601776
1m 57s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [467, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/law-ctx.jsonl_text_document']}, 'freeze_updates': {'enabled': True, 'modules': {'decoder': 200}}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Failed
-
a0970601776
1m 58s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [467, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/law-ctx.jsonl_text_document']}, 'freeze_updates': {'enabled': True, 'modules': {'encoder': 200}}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Finished
-
a0970601776
2h 1m 8s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [467, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/law-ctx.jsonl_text_document']}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/Llama-3-Taiwan-8B-Instruct-DPO.nemo', 'precision': 'bf16-mixed'}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
-
a0970601776
20h 29m 11s
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 64, 'rampup_batch_size': None, 'tensor_model_parallel_size': 4, 'pipeline_model_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'context_parallel_size': 1, 'encoder_seq_length': 8192, 'max_position_embeddings': 8192, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 14336, 'num_attention_heads': 32, 'num_query_groups': 8, 'init_method_std': 0.01, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'tokenizer': {'library': 'huggingface', 'type': 'meta-llama/Meta-Llama-3-8B', 'use_fast': True, 'model': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/Llama-3-Taiwan-8B-Instruct-DPO/tokenizer.json'}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': True, 'bias_activation_fusion': True, 'bias_dropout_add_fusion': True, 'masked_softmax_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': None, 'activations_checkpoint_method': None, 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': False, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'ub_tp_comm_overlap': False, 'use_flash_attention': True, 'gc_interval': 100, 'nsys_profile': {'enabled': False, 'trace': ['nvtx', 'cuda'], 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'bucket_cap_mb': 125, 'overlap_grad_sync': True, 'overlap_param_sync': True, 'contiguous_grad_buffer': True, 'contiguous_param_buffer': True, 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 200000, 'constant_steps': 0, 'min_lr': 1e-05}}, 'data': {'data_impl': 'mmap', 'splits_string': '9990,8,2', 'seq_length': 8192, 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'index_mapping_dir': None, 'data_prefix': [466, '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/data/custom_dataset/preprocessed/datasets.jsonl_text_document']}, 'seq_len_interpolation_factor': None, 'restore_from_path': '/mnt/home/owen/NeMo-Framework-Launcher/launcher_scripts/results/Llama-3-Taiwan-8B-Instruct-DPO/results/checkpoints/megatron_llama.nemo', 'precision': 'bf16-mixed'}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
1-20
of 25