Eliebak's workspace
Runs
2
State
Notes
User
Tags
Created
Runtime
Sweep
nanotron_config.checkpoints.checkpoint_interval
nanotron_config.checkpoints.checkpoints_path
nanotron_config.checkpoints.checkpoints_path_is_shared_file_system
nanotron_config.checkpoints.load_lr_scheduler
nanotron_config.checkpoints.load_optimizer
nanotron_config.checkpoints.save_final_state
nanotron_config.checkpoints.save_initial_state
nanotron_config.data_stages
nanotron_config.general.ignore_sanity_checks
nanotron_config.general.project
nanotron_config.general.run
nanotron_config.general.seed
nanotron_config.logging.iteration_step_info_interval
nanotron_config.logging.log_level
nanotron_config.logging.log_level_replica
nanotron_config.metrics_logging.log_detail_interval
nanotron_config.metrics_logging.log_level
nanotron_config.model.ddp_bucket_cap_mb
nanotron_config.model.dtype
nanotron_config.model.init_method.scaling_method
nanotron_config.model.init_method.std
nanotron_config.model.make_vocab_size_divisible_by
nanotron_config.model.model_config._attn_implementation
nanotron_config.model.model_config._fused_rms_norm
nanotron_config.model.model_config._fused_rotary_emb
nanotron_config.model.model_config._use_doc_masking
nanotron_config.model.model_config._use_qkv_packed
nanotron_config.model.model_config.attention_bias
nanotron_config.model.model_config.bos_token_id
nanotron_config.model.model_config.eos_token_id
nanotron_config.model.model_config.hidden_act
nanotron_config.model.model_config.hidden_size
nanotron_config.model.model_config.initializer_range
nanotron_config.model.model_config.intermediate_size
nanotron_config.model.model_config.is_qwen2_config
nanotron_config.model.model_config.log_attn_probs
nanotron_config.model.model_config.max_position_embeddings
nanotron_config.model.model_config.no_rope_layer
nanotron_config.model.model_config.num_attention_heads
nanotron_config.model.model_config.num_hidden_layers
nanotron_config.model.model_config.num_key_value_heads
nanotron_config.model.model_config.pretraining_tp
nanotron_config.model.model_config.rms_norm_eps
nanotron_config.model.model_config.rope_interleaved
Finished
eliebak
6h 6m 23s
-
2000
/scratch/elie/checkpoints
false
true
true
true
false
[{"data":{"dataset":{"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/elie_bakouch/fineweb-edu","/scratch/elie_bakouch/finemath"],"dataset_weights":[0.8,0.2],"return_positions":true,"shuffle_files":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","pad_samples_to_global_batch_size":false,"skip_in_stream":false,"token_size_in_bytes":4},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096,"start_training_step":1}]
true
boomtitan-nanotron
elie-4n-1608
6
1
info
info
200
1
50
bfloat16
NUM_LAYERS
0.02
1
flash_attention_2
true
true
true
true
false
128000
128001
silu
2048
0.02
11008
true
true
4096
4
16
36
4
2
0.000001
false
Finished
eliebak
6h 6m 22s
-
2000
/scratch/elie/checkpoints
false
true
true
true
false
[{"data":{"dataset":{"skip_in_stream":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"dataset_weights":[0.8,0.2],"return_positions":true,"token_size_in_bytes":4,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/elie_bakouch/fineweb-edu","/scratch/elie_bakouch/finemath"],"pad_samples_to_global_batch_size":false,"shuffle_files":false},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096,"start_training_step":1}]
true
boomtitan-nanotron
elie-4n-1608
6
1
info
info
200
1
50
bfloat16
NUM_LAYERS
0.02
1
flash_attention_2
true
true
true
true
false
128000
128001
silu
2048
0.02
11008
true
true
4096
4
16
36
4
2
0.000001
false
1-2
of 2