Bgiddwani's workspace
Runs
17
Name
3 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
cfg
consumed_samples
epoch
global_step
grad_norm
lr
reduced_train_loss
train_backward_timing in s
train_step_timing in s
trainer/global_step
val_loss
validation_step_timing in s
Failed
-
bgiddwani
23s
-
-
-
-
-
-
-
-
-
-
-
-
-
Killed
-
bgiddwani
13m 58s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
3200
0
99
2.06038
0.000099991
6.27243
0.000043631
2.03192
99
6.26704
0.00013375
Killed
-
bgiddwani
4m 8s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/indiccorp-v2-tokenizer/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/indiccorpv2-hindi/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
1600
0
49
1.19257
0.000099998
6.90204
0.000051022
0.71139
49
6.84033
0.00016737
Finished
-
bgiddwani
9m 42s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
3040
0
94
1.70458
0.000099992
6.34246
0.000051737
2.04157
94
7.00326
0.00016856
Finished
-
bgiddwani
2h 46m 9s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/indiccorp-v2-tokenizer/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/indiccorpv2-hindi/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
35200
0
1099
0.49507
0.000098809
3.29885
0.000049353
0.71741
723
3.33524
0.16081
Finished
-
bgiddwani
6h 31m 23s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
79520
0
2484
0.41226
0.000093996
2.96546
0.000041962
2.05869
2484
2.96196
0.00014114
Crashed
-
bgiddwani
4h 18m 3s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/indiccorp-v2-tokenizer/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/indiccorpv2-hindi/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
52800
0
1649
0.47008
0.000097325
3.14774
0.000050068
0.90768
1649
3.11995
0.00015998
Crashed
-
bgiddwani
33s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/indiccorp-v2-tokenizer/tokenizer/hi-indicorpv2-tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/indiccorpv2-hindi/mmap/indicorps-v2-hindi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
160
0
4
206.34572
0.00008
15.03847
0.000060558
0.90131
4
-
-
Failed
-
bgiddwani
12s
-
-
-
-
-
-
-
-
-
-
-
-
-
Crashed
-
bgiddwani
2h 9m 32s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': True, 'fp8_e4m3': False, 'fp8_hybrid': True, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
38400
0
1199
0.34542
0.000098582
0.22595
0.000047207
1.01012
1199
5.83812
0.00015497
Crashed
-
bgiddwani
7h 1m 34s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
128000
0
3999
0.14109
0.000084974
0.17239
0.000041962
1.97204
3999
6.73109
0.00012231
Crashed
-
bgiddwani
5s
-
-
-
-
-
-
-
-
-
-
-
-
-
Killed
-
bgiddwani
14m 11s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
4672
0
145
1.32495
0.00009998
6.03586
0.000041723
1.97885
148
5.95245
0.0001421
Finished
-
bgiddwani
8m 36s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
2080
0
64
2.31496
0.000099997
6.9483
0.000043392
1.96347
64
7.05982
0.00013685
Crashed
-
bgiddwani
1h 18m 31s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
17600
0
549
1.12887
0.000099705
1.23728
0.000046492
0.91531
549
4.84837
0.00015807
Finished
-
bgiddwani
40s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
640
0
19
5.244
0.0001
8.18174
0.000047922
0.89674
19
-
-
Crashed
-
bgiddwani
27m 20s
-
{'mcore_gpt': True, 'micro_batch_size': 4, 'global_batch_size': 32, 'tensor_model_parallel_size': 2, 'pipeline_model_parallel_size': 1, 'virtual_pipeline_model_parallel_size': None, 'encoder_seq_length': 2048, 'max_position_embeddings': 2048, 'num_layers': 32, 'hidden_size': 4096, 'ffn_hidden_size': 11008, 'num_attention_heads': 32, 'init_method_std': 0.02, 'use_scaled_init_method': True, 'hidden_dropout': 0.0, 'attention_dropout': 0.0, 'ffn_dropout': 0.0, 'kv_channels': None, 'apply_query_key_layer_scaling': True, 'normalization': 'rmsnorm', 'layernorm_epsilon': 1e-05, 'do_layer_norm_weight_decay': False, 'make_vocab_size_divisible_by': 128, 'pre_process': True, 'post_process': True, 'persist_layer_norm': True, 'bias': False, 'activation': 'fast-swiglu', 'headscale': False, 'transformer_block_type': 'pre_ln', 'openai_gelu': False, 'normalize_attention_scores': True, 'position_embedding_type': 'rope', 'rotary_percentage': 1.0, 'attention_type': 'multihead', 'share_embeddings_and_output_weights': False, 'overlap_p2p_comm': False, 'batch_p2p_comm': True, 'num_query_groups': 32, 'tokenizer': {'library': 'sentencepiece', 'type': None, 'model': '/workspace/pretrain_local/scripts/hi_tokenizer.model', 'vocab_file': None, 'merge_file': None, 'delimiter': None, 'sentencepiece_legacy': False}, 'native_amp_init_scale': 4294967296, 'native_amp_growth_interval': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'fp16_lm_cross_entropy': False, 'megatron_amp_O2': True, 'grad_allreduce_chunk_size_mb': 125, 'grad_div_ar_fusion': True, 'gradient_accumulation_fusion': False, 'bias_activation_fusion': False, 'bias_dropout_add_fusion': False, 'masked_softmax_fusion': True, 'get_attention_mask_from_fusion': True, 'seed': 1234, 'resume_from_checkpoint': None, 'use_cpu_initialization': False, 'onnx_safe': False, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'sync_batch_comm': False, 'activations_checkpoint_granularity': 'selective', 'activations_checkpoint_method': 'uniform', 'activations_checkpoint_num_layers': None, 'num_micro_batches_with_partial_activation_checkpoints': None, 'activations_checkpoint_layers_per_pipeline': None, 'sequence_parallel': True, 'transformer_engine': True, 'fp8': False, 'fp8_e4m3': False, 'fp8_hybrid': False, 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'reduce_amax': True, 'use_emha': False, 'data': {'data_prefix': [1.0, '/workspace/pretrain_local/data/hi_text_document'], 'index_mapping_dir': None, 'data_impl': 'mmap', 'splits_string': '900,8,2', 'seq_length': '${model.encoder_seq_length}', 'skip_warmup': True, 'num_workers': 0, 'dataloader_type': 'single', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'validation_drop_last': True, 'no_seqlen_plus_one_input_tokens': False, 'pad_samples_to_global_batch_size': False, 'shuffle_documents': True}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'optim': {'name': 'distributed_fused_adam', 'lr': 0.0001, 'weight_decay': 0.01, 'betas': [0.9, 0.98], 'sched': {'name': 'CosineAnnealing', 'warmup_steps': 5, 'constant_steps': 90, 'min_lr': 1e-05}}, 'precision': 'bf16-mixed'}
6400
0
199
1.24389
0.000099962
5.47413
0.000043392
1.97811
199
5.52167
0.0001297
1-17
of 17