bgiddwani

Failed

-

bgiddwani

2y ago

23s

-

Killed

-

bgiddwani

2y ago

13m 58s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

3200

0

99

2.06038

0.000099991

6.27243

0.000043631

2.03192

99

6.26704

0.00013375

Killed

-

bgiddwani

2y ago

4m 8s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/indiccorp-v2-tokenizer/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/indiccorpv2-hindi/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

1600

0

49

1.19257

0.000099998

6.90204

0.000051022

0.71139

49

6.84033

0.00016737

Finished

-

bgiddwani

2y ago

9m 42s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

3040

0

94

1.70458

0.000099992

6.34246

0.000051737

2.04157

94

7.00326

0.00016856

Finished

-

bgiddwani

2y ago

2h 46m 9s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/indiccorp-v2-tokenizer/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/indiccorpv2-hindi/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

35200

0

1099

0.49507

0.000098809

3.29885

0.000049353

0.71741

723

3.33524

0.16081

Finished

-

bgiddwani

2y ago

6h 31m 23s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

79520

0

2484

0.41226

0.000093996

2.96546

0.000041962

2.05869

2484

2.96196

0.00014114

Crashed

-

bgiddwani

2y ago

4h 18m 3s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/indiccorp-v2-tokenizer/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/indiccorpv2-hindi/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

52800

0

1649

0.47008

0.000097325

3.14774

0.000050068

0.90768

1649

3.11995

0.00015998

Crashed

-

bgiddwani

2y ago

33s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/indiccorp-v2-tokenizer/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/indiccorpv2-hindi/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

160

0

4

206.34572

0.00008

15.03847

0.000060558

0.90131

4

-

Failed

-

bgiddwani

2y ago

12s

-

Crashed

-

bgiddwani

2y ago

2h 9m 32s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

38400

0

1199

0.34542

0.000098582

0.22595

0.000047207

1.01012

1199

5.83812

0.00015497

Crashed

-

bgiddwani

2y ago

7h 1m 34s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

128000

0

3999

0.14109

0.000084974

0.17239

0.000041962

1.97204

3999

6.73109

0.00012231

Crashed

-

bgiddwani

2y ago

5s

-

Killed

-

bgiddwani

2y ago

14m 11s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

4672

0

145

1.32495

0.00009998

6.03586

0.000041723

1.97885

148

5.95245

0.0001421

Finished

-

bgiddwani

2y ago

8m 36s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

2080

0

64

2.31496

0.000099997

6.9483

0.000043392

1.96347

64

7.05982

0.00013685

Crashed

-

bgiddwani

2y ago

1h 18m 31s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

17600

0

549

1.12887

0.000099705

1.23728

0.000046492

0.91531

549

4.84837

0.00015807

Finished

-

bgiddwani

2y ago

40s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

640

0

19

5.244

0.0001

8.18174

0.000047922

0.89674

19

-

Crashed

-

bgiddwani

2y ago

27m 20s

-

{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}

6400

0

199

1.24389

0.000099962

5.47413

0.000043392

1.97811

199

5.52167

0.0001297

Bgiddwani's workspace