Amir-mahla's workspace
Runs
29
Name
28 visualized
State
Notes
User
Tags
Created
Runtime
Sweep
nanotron_config.checkpoints.checkpoint_interval
nanotron_config.checkpoints.checkpoints_path
nanotron_config.checkpoints.checkpoints_path_is_shared_file_system
nanotron_config.checkpoints.load_lr_scheduler
nanotron_config.checkpoints.load_optimizer
nanotron_config.checkpoints.resume_checkpoint_path
nanotron_config.checkpoints.save_final_state
nanotron_config.checkpoints.save_initial_state
nanotron_config.data_stages
nanotron_config.general.ignore_sanity_checks
nanotron_config.general.project
nanotron_config.general.run
nanotron_config.general.seed
nanotron_config.lighteval.batch_size
nanotron_config.lighteval.eval_config_override
nanotron_config.lighteval.eval_interval
nanotron_config.lighteval.eval_interval_file
nanotron_config.lighteval.local_checkpoint_dir
nanotron_config.lighteval.logs_path
nanotron_config.lighteval.nanotron_path
nanotron_config.lighteval.output_dir
nanotron_config.lighteval.parallelism.context_parallel_size
nanotron_config.lighteval.parallelism.dp
nanotron_config.lighteval.parallelism.expert_parallel_size
nanotron_config.lighteval.parallelism.moe_layer_recompute
nanotron_config.lighteval.parallelism.pp
nanotron_config.lighteval.parallelism.pp_engine
nanotron_config.lighteval.parallelism.recompute_layer
nanotron_config.lighteval.parallelism.tp
nanotron_config.lighteval.parallelism.tp_linear_async_communication
nanotron_config.lighteval.parallelism.tp_mode
nanotron_config.lighteval.parallelism.tp_recompute_allgather
nanotron_config.lighteval.s3_save_path
nanotron_config.lighteval.slurm.cpus_per_task
nanotron_config.lighteval.slurm.gpus_per_node
nanotron_config.lighteval.slurm.hf_cache
nanotron_config.lighteval.slurm.partition
nanotron_config.lighteval.slurm.qos
nanotron_config.lighteval.slurm.reservation
nanotron_config.lighteval.slurm.time
nanotron_config.lighteval.slurm_script_dir
nanotron_config.lighteval.upload_to_wandb
nanotron_config.lighteval.wandb_entity
nanotron_config.lighteval.wandb_project
Finished
loubnabnl
3h 11m 13s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"name":"stable","sequence_length":4096,"start_training_step":1,"data":{"dataset":{"shuffle_files":false,"skip_in_stream":false,"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"pad_samples_to_global_batch_size":false,"return_positions":true},"num_loading_workers":0,"seed":6}},{"sequence_length":4096,"start_training_step":3450001,"data":{"dataset":{"token_size_in_bytes":4,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"return_positions":true,"shuffle_files":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004],"pad_samples_to_global_batch_size":false,"skip_in_stream":false},"num_loading_workers":0,"seed":6},"name":"stable stage 2"},{"data":{"dataset":{"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized_4097/mwiki/standard","s3://smollm3/datasets/llama_tokenized_4097/cosmopedia2/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/","s3://smollm3/datasets/llama_tokenized_4097/openmathinstruct-2/standard/","s3://smollm3/datasets/llama_tokenized_4097/openmathreasoning-4k/tokenized/","s3://smollm3/datasets/llama_tokenized_4097/opencodereasoning-4k-fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/natural_reasoning_fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/problem-solving/standard/","s3://smollm3/datasets/llama_tokenized_4097/2students/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth_gsm_gsm8k/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth/standard/"],"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/multilingual_wiki","/scratch/smollm3-data-part1/cosmopedia2","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues","/scratch/smollm3-data-part1/openmathinstruct-2","/scratch/smollm3-data-part1/openmathreasoning-4k","/scratch/smollm3-data-part1/open-codereasoning-4k","/scratch/smollm3-data-part1/natural_reasoning","/scratch/smollm3-data-part1/tiny-gsm-mind-problem-solving","/scratch/smollm3-data-part1/tiny-gsm-mind-2students","/scratch/smollm3-data-part1/dolmino_math_synth_gsm_gsm8k","/scratch/smollm3-data-part1/dolmino_math_synth_basic"],"dataset_weights":[0.2,0.3,0.002,0.0002,0.008,0.004,0.001,0.018,0.022,0.023,0.0125,0.0045,0.01,0.01,0.009,0.0032,0.0032,0.0032,0.0032,0.00005,0.0022,0.002,0.002,0.02,0.025,0.014,0.002,0.05,0.07,0.018,0.018,0.008,0.044,0.006,0.006,0.003,0.002,0.013,0.001,0.005,0.0102,0.001,0.0005,0.006,0.005,0.0006,0.012,0.004,0.005,0.005,0.0005,0.001,0.003,0.003,0.0004,0.0002],"pad_samples_to_global_batch_size":false,"return_positions":true,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","dataset_max_tokens":null,"shuffle_files":false,"use_old_brrr_dataloader":false},"num_loading_workers":0,"seed":6},"name":"decay stage","sequence_length":4096,"start_training_step":4198001}]
true
smollm3-3B-final
loubna-48n-2105-decay
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
loubnabnl
19h 11m 16s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"data":{"dataset":{"return_positions":true,"skip_in_stream":false,"token_size_in_bytes":4,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"pad_samples_to_global_batch_size":false,"shuffle_files":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032]},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096,"start_training_step":1},{"start_training_step":3450001,"data":{"num_loading_workers":0,"seed":6,"dataset":{"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004],"pad_samples_to_global_batch_size":false,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"return_positions":true,"shuffle_files":false}},"name":"stable stage 2","sequence_length":4096},{"sequence_length":4096,"start_training_step":4198001,"data":{"dataset":{"shuffle_files":false,"skip_in_stream":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized_4097/mwiki/standard","s3://smollm3/datasets/llama_tokenized_4097/cosmopedia2/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/","s3://smollm3/datasets/llama_tokenized_4097/openmathinstruct-2/standard/","s3://smollm3/datasets/llama_tokenized_4097/openmathreasoning-4k/tokenized/","s3://smollm3/datasets/llama_tokenized_4097/opencodereasoning-4k-fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/natural_reasoning_fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/problem-solving/standard/","s3://smollm3/datasets/llama_tokenized_4097/2students/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth_gsm_gsm8k/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth/standard/"],"dataset_max_tokens":null,"dataset_weights":[0.2,0.3,0.002,0.0002,0.008,0.004,0.001,0.018,0.022,0.023,0.0125,0.0045,0.01,0.01,0.009,0.0032,0.0032,0.0032,0.0032,0.00005,0.0022,0.002,0.002,0.02,0.025,0.014,0.002,0.05,0.07,0.018,0.018,0.008,0.044,0.006,0.006,0.003,0.002,0.013,0.001,0.005,0.0102,0.001,0.0005,0.006,0.005,0.0006,0.012,0.004,0.005,0.005,0.0005,0.001,0.003,0.003,0.0004,0.0002],"return_positions":true,"token_size_in_bytes":4,"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/multilingual_wiki","/scratch/smollm3-data-part1/cosmopedia2","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues","/scratch/smollm3-data-part1/openmathinstruct-2","/scratch/smollm3-data-part1/openmathreasoning-4k","/scratch/smollm3-data-part1/open-codereasoning-4k","/scratch/smollm3-data-part1/natural_reasoning","/scratch/smollm3-data-part1/tiny-gsm-mind-problem-solving","/scratch/smollm3-data-part1/tiny-gsm-mind-2students","/scratch/smollm3-data-part1/dolmino_math_synth_gsm_gsm8k","/scratch/smollm3-data-part1/dolmino_math_synth_basic"],"pad_samples_to_global_batch_size":false},"num_loading_workers":0,"seed":6},"name":"decay stage"}]
true
smollm3-3B-final
loubna-48n-2105-decay
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
loubnabnl
5h 40m 24s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"name":"stable","sequence_length":4096,"start_training_step":1,"data":{"dataset":{"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"pad_samples_to_global_batch_size":false,"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"vocab_size":128256,"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"return_positions":true,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false},"num_loading_workers":0,"seed":6}},{"data":{"seed":6,"dataset":{"skip_in_stream":false,"token_size_in_bytes":4,"vocab_size":128256,"dataset_max_tokens":null,"pad_samples_to_global_batch_size":false,"return_positions":true,"shuffle_files":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004]},"num_loading_workers":0},"name":"stable stage 2","sequence_length":4096,"start_training_step":3450001},{"data":{"num_loading_workers":0,"seed":6,"dataset":{"return_positions":true,"skip_in_stream":false,"token_size_in_bytes":4,"dataset_weights":[0.2,0.3,0.002,0.0002,0.008,0.004,0.001,0.018,0.022,0.023,0.0125,0.0045,0.01,0.01,0.009,0.0032,0.0032,0.0032,0.0032,0.00005,0.0022,0.002,0.002,0.02,0.025,0.014,0.002,0.05,0.07,0.018,0.018,0.008,0.044,0.006,0.006,0.003,0.002,0.013,0.001,0.005,0.0102,0.001,0.0005,0.006,0.005,0.0006,0.012,0.004,0.005,0.005,0.0005,0.001,0.003,0.003,0.0004,0.0002],"pad_samples_to_global_batch_size":false,"shuffle_files":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized_4097/mwiki/standard","s3://smollm3/datasets/llama_tokenized_4097/cosmopedia2/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/","s3://smollm3/datasets/llama_tokenized_4097/openmathinstruct-2/standard/","s3://smollm3/datasets/llama_tokenized_4097/openmathreasoning-4k/tokenized/","s3://smollm3/datasets/llama_tokenized_4097/opencodereasoning-4k-fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/natural_reasoning_fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/problem-solving/standard/","s3://smollm3/datasets/llama_tokenized_4097/2students/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth_gsm_gsm8k/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth/standard/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/multilingual_wiki","/scratch/smollm3-data-part1/cosmopedia2","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues","/scratch/smollm3-data-part1/openmathinstruct-2","/scratch/smollm3-data-part1/openmathreasoning-4k","/scratch/smollm3-data-part1/open-codereasoning-4k","/scratch/smollm3-data-part1/natural_reasoning","/scratch/smollm3-data-part1/tiny-gsm-mind-problem-solving","/scratch/smollm3-data-part1/tiny-gsm-mind-2students","/scratch/smollm3-data-part1/dolmino_math_synth_gsm_gsm8k","/scratch/smollm3-data-part1/dolmino_math_synth_basic"]}},"name":"decay stage","sequence_length":4096,"start_training_step":4198001}]
true
smollm3-3B-final
loubna-48n-2105-decay
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
loubnabnl
32m 16s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"name":"stable","sequence_length":4096,"start_training_step":1,"data":{"seed":6,"dataset":{"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"pad_samples_to_global_batch_size":false,"return_positions":true,"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032]},"num_loading_workers":0}},{"data":{"dataset":{"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004],"pad_samples_to_global_batch_size":false,"shuffle_files":false,"token_size_in_bytes":4,"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"return_positions":true,"skip_in_stream":false,"tokenizer_name":"meta-llama/Llama-3.2-1B"},"num_loading_workers":0,"seed":6},"name":"stable stage 2","sequence_length":4096,"start_training_step":3450001},{"start_training_step":4198001,"data":{"dataset":{"dataset_weights":[0.2,0.3,0.002,0.0002,0.008,0.004,0.001,0.018,0.022,0.023,0.0125,0.0045,0.01,0.01,0.009,0.0032,0.0032,0.0032,0.0032,0.00005,0.0022,0.002,0.002,0.02,0.025,0.014,0.002,0.05,0.07,0.018,0.018,0.008,0.044,0.006,0.006,0.003,0.002,0.013,0.001,0.005,0.0102,0.001,0.0005,0.006,0.005,0.0006,0.012,0.004,0.005,0.005,0.0005,0.001,0.003,0.003,0.0004,0.0002],"pad_samples_to_global_batch_size":false,"shuffle_files":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized_4097/mwiki/standard","s3://smollm3/datasets/llama_tokenized_4097/cosmopedia2/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/","s3://smollm3/datasets/llama_tokenized_4097/openmathinstruct-2/standard/","s3://smollm3/datasets/llama_tokenized_4097/openmathreasoning-4k/tokenized/","s3://smollm3/datasets/llama_tokenized_4097/opencodereasoning-4k-fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/natural_reasoning_fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/problem-solving/standard/","s3://smollm3/datasets/llama_tokenized_4097/2students/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth_gsm_gsm8k/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth/standard/"],"dataset_max_tokens":null,"return_positions":true,"skip_in_stream":false,"use_old_brrr_dataloader":false,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/multilingual_wiki","/scratch/smollm3-data-part1/cosmopedia2","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues","/scratch/smollm3-data-part1/openmathinstruct-2","/scratch/smollm3-data-part1/openmathreasoning-4k","/scratch/smollm3-data-part1/open-codereasoning-4k","/scratch/smollm3-data-part1/natural_reasoning","/scratch/smollm3-data-part1/tiny-gsm-mind-problem-solving","/scratch/smollm3-data-part1/tiny-gsm-mind-2students","/scratch/smollm3-data-part1/dolmino_math_synth_gsm_gsm8k","/scratch/smollm3-data-part1/dolmino_math_synth_basic"]},"num_loading_workers":0,"seed":6},"name":"decay stage","sequence_length":4096}]
true
smollm3-3B-final
loubna-48n-2105-decay
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
loubnabnl
32m 16s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"data":{"dataset":{"token_size_in_bytes":4,"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"pad_samples_to_global_batch_size":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"return_positions":true,"shuffle_files":false,"skip_in_stream":false},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096,"start_training_step":1},{"data":{"num_loading_workers":0,"seed":6,"dataset":{"return_positions":true,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"shuffle_files":false,"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004],"pad_samples_to_global_batch_size":false}},"name":"stable stage 2","sequence_length":4096,"start_training_step":3450001},{"data":{"dataset":{"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized_4097/mwiki/standard","s3://smollm3/datasets/llama_tokenized_4097/cosmopedia2/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/","s3://smollm3/datasets/llama_tokenized_4097/openmathinstruct-2/standard/","s3://smollm3/datasets/llama_tokenized_4097/openmathreasoning-4k/tokenized/","s3://smollm3/datasets/llama_tokenized_4097/opencodereasoning-4k-fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/natural_reasoning_fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/problem-solving/standard/","s3://smollm3/datasets/llama_tokenized_4097/2students/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth_gsm_gsm8k/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth/standard/"],"return_positions":true,"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"use_old_brrr_dataloader":false,"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/multilingual_wiki","/scratch/smollm3-data-part1/cosmopedia2","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues","/scratch/smollm3-data-part1/openmathinstruct-2","/scratch/smollm3-data-part1/openmathreasoning-4k","/scratch/smollm3-data-part1/open-codereasoning-4k","/scratch/smollm3-data-part1/natural_reasoning","/scratch/smollm3-data-part1/tiny-gsm-mind-problem-solving","/scratch/smollm3-data-part1/tiny-gsm-mind-2students","/scratch/smollm3-data-part1/dolmino_math_synth_gsm_gsm8k","/scratch/smollm3-data-part1/dolmino_math_synth_basic"],"dataset_weights":[0.2,0.3,0.002,0.0002,0.008,0.004,0.001,0.018,0.022,0.023,0.0125,0.0045,0.01,0.01,0.009,0.0032,0.0032,0.0032,0.0032,0.00005,0.0022,0.002,0.002,0.02,0.025,0.014,0.002,0.05,0.07,0.018,0.018,0.008,0.044,0.006,0.006,0.003,0.002,0.013,0.001,0.005,0.0102,0.001,0.0005,0.006,0.005,0.0006,0.012,0.004,0.005,0.005,0.0005,0.001,0.003,0.003,0.0004,0.0002],"pad_samples_to_global_batch_size":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","vocab_size":128256},"num_loading_workers":0,"seed":6},"name":"decay stage","sequence_length":4096,"start_training_step":4198001}]
true
smollm3-3B-final
loubna-48n-2105-decay
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
loubnabnl
46s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"data":{"dataset":{"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"pad_samples_to_global_batch_size":false,"return_positions":true,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"dataset_max_tokens":null,"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"]},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096,"start_training_step":1},{"data":{"dataset":{"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"return_positions":true,"shuffle_files":false,"token_size_in_bytes":4,"use_old_brrr_dataloader":false,"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004],"pad_samples_to_global_batch_size":false,"skip_in_stream":false,"tokenizer_name":"meta-llama/Llama-3.2-1B"},"num_loading_workers":0,"seed":6},"name":"stable stage 2","sequence_length":4096,"start_training_step":3450001},{"data":{"dataset":{"skip_in_stream":false,"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/multilingual_wiki","/scratch/smollm3-data-part1/cosmopedia2","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues","/scratch/smollm3-data-part1/openmathinstruct-2","/scratch/smollm3-data-part1/openmathreasoning-4k","/scratch/smollm3-data-part1/open-codereasoning-4k","/scratch/smollm3-data-part1/natural_reasoning","/scratch/smollm3-data-part1/tiny-gsm-mind-problem-solving","/scratch/smollm3-data-part1/tiny-gsm-mind-2students","/scratch/smollm3-data-part1/dolmino_math_synth_gsm_gsm8k","/scratch/smollm3-data-part1/dolmino_math_synth_basic"],"dataset_weights":[0.2,0.3,0.002,0.0002,0.008,0.004,0.001,0.018,0.022,0.023,0.0125,0.0045,0.01,0.01,0.009,0.0032,0.0032,0.0032,0.0032,0.00005,0.0022,0.002,0.002,0.02,0.025,0.014,0.002,0.05,0.07,0.018,0.018,0.008,0.044,0.006,0.006,0.003,0.002,0.013,0.001,0.005,0.0102,0.001,0.0005,0.006,0.005,0.0006,0.012,0.004,0.005,0.005,0.0005,0.001,0.003,0.003,0.0004,0.0002],"return_positions":true,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized_4097/mwiki/standard","s3://smollm3/datasets/llama_tokenized_4097/cosmopedia2/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/","s3://smollm3/datasets/llama_tokenized_4097/openmathinstruct-2/standard/","s3://smollm3/datasets/llama_tokenized_4097/openmathreasoning-4k/tokenized/","s3://smollm3/datasets/llama_tokenized_4097/opencodereasoning-4k-fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/natural_reasoning_fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/problem-solving/standard/","s3://smollm3/datasets/llama_tokenized_4097/2students/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth_gsm_gsm8k/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth/standard/"],"dataset_max_tokens":null,"pad_samples_to_global_batch_size":false,"shuffle_files":false},"num_loading_workers":0,"seed":6},"name":"decay stage","sequence_length":4096,"start_training_step":4198001}]
true
smollm3-3B-final
loubna-48n-2105-decay
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
loubnabnl
1d 14h 58m 54s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"data":{"dataset":{"return_positions":true,"shuffle_files":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"pad_samples_to_global_batch_size":false,"skip_in_stream":false,"use_old_brrr_dataloader":false,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032]},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096,"start_training_step":1},{"data":{"dataset":{"pad_samples_to_global_batch_size":false,"return_positions":true,"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"dataset_max_tokens":null,"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004],"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"]},"num_loading_workers":0,"seed":6},"name":"stable stage 2","sequence_length":4096,"start_training_step":3450001},{"data":{"dataset":{"pad_samples_to_global_batch_size":false,"return_positions":true,"shuffle_files":false,"skip_in_stream":false,"use_old_brrr_dataloader":false,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized_4097/mwiki/standard","s3://smollm3/datasets/llama_tokenized_4097/cosmopedia2/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/","s3://smollm3/datasets/llama_tokenized_4097/openmathinstruct-2/standard/","s3://smollm3/datasets/llama_tokenized_4097/openmathreasoning-4k/tokenized/","s3://smollm3/datasets/llama_tokenized_4097/opencodereasoning-4k-fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/natural_reasoning_fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/problem-solving/standard/","s3://smollm3/datasets/llama_tokenized_4097/2students/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth_gsm_gsm8k/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth/standard/"],"dataset_max_tokens":null,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","vocab_size":128256,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/multilingual_wiki","/scratch/smollm3-data-part1/cosmopedia2","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues","/scratch/smollm3-data-part1/openmathinstruct-2","/scratch/smollm3-data-part1/openmathreasoning-4k","/scratch/smollm3-data-part1/open-codereasoning-4k","/scratch/smollm3-data-part1/natural_reasoning","/scratch/smollm3-data-part1/tiny-gsm-mind-problem-solving","/scratch/smollm3-data-part1/tiny-gsm-mind-2students","/scratch/smollm3-data-part1/dolmino_math_synth_gsm_gsm8k","/scratch/smollm3-data-part1/dolmino_math_synth_basic"],"dataset_weights":[0.2,0.3,0.002,0.0002,0.008,0.004,0.001,0.018,0.022,0.023,0.0125,0.0045,0.01,0.01,0.009,0.0032,0.0032,0.0032,0.0032,0.00005,0.0022,0.002,0.002,0.02,0.025,0.014,0.002,0.05,0.07,0.018,0.018,0.008,0.044,0.006,0.006,0.003,0.002,0.013,0.001,0.005,0.0102,0.001,0.0005,0.006,0.005,0.0006,0.012,0.004,0.005,0.005,0.0005,0.001,0.003,0.003,0.0004,0.0002]},"num_loading_workers":0,"seed":6},"name":"decay stage","sequence_length":4096,"start_training_step":4198001}]
true
smollm3-3B-final
loubna-48n-2105-decay
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
loubnabnl
12h 7m 16s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"data":{"num_loading_workers":0,"seed":6,"dataset":{"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"pad_samples_to_global_batch_size":false,"skip_in_stream":false,"token_size_in_bytes":4,"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"return_positions":true,"shuffle_files":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256}},"name":"stable","sequence_length":4096,"start_training_step":1},{"data":{"dataset":{"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004],"pad_samples_to_global_batch_size":false,"return_positions":true,"tokenizer_name":"meta-llama/Llama-3.2-1B","dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"]},"num_loading_workers":0,"seed":6},"name":"stable stage 2","sequence_length":4096,"start_training_step":3450001},{"data":{"dataset":{"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized_4097/mwiki/standard","s3://smollm3/datasets/llama_tokenized_4097/cosmopedia2/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/","s3://smollm3/datasets/llama_tokenized_4097/openmathinstruct-2/standard/","s3://smollm3/datasets/llama_tokenized_4097/openmathreasoning-4k/tokenized/","s3://smollm3/datasets/llama_tokenized_4097/opencodereasoning-4k-fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/natural_reasoning_fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/problem-solving/standard/","s3://smollm3/datasets/llama_tokenized_4097/2students/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth_gsm_gsm8k/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth/standard/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/multilingual_wiki","/scratch/smollm3-data-part1/cosmopedia2","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues","/scratch/smollm3-data-part1/openmathinstruct-2","/scratch/smollm3-data-part1/openmathreasoning-4k","/scratch/smollm3-data-part1/open-codereasoning-4k","/scratch/smollm3-data-part1/natural_reasoning","/scratch/smollm3-data-part1/tiny-gsm-mind-problem-solving","/scratch/smollm3-data-part1/tiny-gsm-mind-2students","/scratch/smollm3-data-part1/dolmino_math_synth_gsm_gsm8k","/scratch/smollm3-data-part1/dolmino_math_synth_basic"],"shuffle_files":false,"vocab_size":128256,"dataset_weights":[0.2,0.3,0.002,0.0002,0.008,0.004,0.001,0.018,0.022,0.023,0.0125,0.0045,0.01,0.01,0.009,0.0032,0.0032,0.0032,0.0032,0.00005,0.0022,0.002,0.002,0.02,0.025,0.014,0.002,0.05,0.07,0.018,0.018,0.008,0.044,0.006,0.006,0.003,0.002,0.013,0.001,0.005,0.0102,0.001,0.0005,0.006,0.005,0.0006,0.012,0.004,0.005,0.005,0.0005,0.001,0.003,0.003,0.0004,0.0002],"pad_samples_to_global_batch_size":false,"return_positions":true,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false},"num_loading_workers":0,"seed":6},"name":"decay stage","sequence_length":4096,"start_training_step":4198001}]
true
smollm3-3B-final
loubna-48n-2105-decay
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
loubnabnl
17h 26m 43s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"name":"stable","sequence_length":4096,"start_training_step":1,"data":{"dataset":{"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"token_size_in_bytes":4,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"pad_samples_to_global_batch_size":false,"return_positions":true,"shuffle_files":false,"skip_in_stream":false},"num_loading_workers":0,"seed":6}},{"data":{"dataset":{"pad_samples_to_global_batch_size":false,"return_positions":true,"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_max_tokens":null,"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004]},"num_loading_workers":0,"seed":6},"name":"stable stage 2","sequence_length":4096,"start_training_step":3450001},{"name":"decay stage","sequence_length":4096,"start_training_step":4198001,"data":{"seed":6,"dataset":{"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized_4097/mwiki/standard","s3://smollm3/datasets/llama_tokenized_4097/cosmopedia2/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/","s3://smollm3/datasets/llama_tokenized_4097/openmathinstruct-2/standard/","s3://smollm3/datasets/llama_tokenized_4097/openmathreasoning-4k/tokenized/","s3://smollm3/datasets/llama_tokenized_4097/opencodereasoning-4k-fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/natural_reasoning_fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/problem-solving/standard/","s3://smollm3/datasets/llama_tokenized_4097/2students/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth_gsm_gsm8k/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth/standard/"],"dataset_weights":[0.2,0.3,0.002,0.0002,0.008,0.004,0.001,0.018,0.022,0.023,0.0125,0.0045,0.01,0.01,0.009,0.0032,0.0032,0.0032,0.0032,0.00005,0.0022,0.002,0.002,0.02,0.025,0.014,0.002,0.05,0.07,0.018,0.018,0.008,0.044,0.006,0.006,0.003,0.002,0.013,0.001,0.005,0.0102,0.001,0.0005,0.006,0.005,0.0006,0.012,0.004,0.005,0.005,0.0005,0.001,0.003,0.003,0.0004,0.0002],"pad_samples_to_global_batch_size":false,"skip_in_stream":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/multilingual_wiki","/scratch/smollm3-data-part1/cosmopedia2","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues","/scratch/smollm3-data-part1/openmathinstruct-2","/scratch/smollm3-data-part1/openmathreasoning-4k","/scratch/smollm3-data-part1/open-codereasoning-4k","/scratch/smollm3-data-part1/natural_reasoning","/scratch/smollm3-data-part1/tiny-gsm-mind-problem-solving","/scratch/smollm3-data-part1/tiny-gsm-mind-2students","/scratch/smollm3-data-part1/dolmino_math_synth_gsm_gsm8k","/scratch/smollm3-data-part1/dolmino_math_synth_basic"],"return_positions":true,"shuffle_files":false,"token_size_in_bytes":4},"num_loading_workers":0}}]
true
smollm3-3B-final
loubna-48n-2105-decay
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
loubnabnl
1m
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"data":{"dataset":{"return_positions":true,"vocab_size":128256,"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"pad_samples_to_global_batch_size":false},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096,"start_training_step":1},{"data":{"dataset":{"dataset_max_tokens":null,"return_positions":true,"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004],"pad_samples_to_global_batch_size":false},"num_loading_workers":0,"seed":6},"name":"stable stage 2","sequence_length":4096,"start_training_step":3450001},{"data":{"dataset":{"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized_4097/mwiki/standard","s3://smollm3/datasets/llama_tokenized_4097/cosmopedia2/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/","s3://smollm3/datasets/llama_tokenized_4097/openmathinstruct-2/standard/","s3://smollm3/datasets/llama_tokenized_4097/openmathreasoning-4k/tokenized/","s3://smollm3/datasets/llama_tokenized_4097/opencodereasoning-4k-fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/natural_reasoning_fix/standard/","s3://smollm3/datasets/llama_tokenized_4097/problem-solving/standard/","s3://smollm3/datasets/llama_tokenized_4097/2students/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth_gsm_gsm8k/standard/","s3://smollm3/datasets/llama_tokenized_4097/dolmino_math_synth/standard/"],"dataset_max_tokens":null,"pad_samples_to_global_batch_size":false,"shuffle_files":false,"skip_in_stream":false,"vocab_size":128256,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/multilingual_wiki","/scratch/smollm3-data-part1/cosmopedia2","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues","/scratch/smollm3-data-part1/openmathinstruct-2","/scratch/smollm3-data-part1/openmathreasoning-4k","/scratch/smollm3-data-part1/open-codereasoning-4k","/scratch/smollm3-data-part1/natural_reasoning","/scratch/smollm3-data-part1/tiny-gsm-mind-problem-solving","/scratch/smollm3-data-part1/tiny-gsm-mind-2students","/scratch/smollm3-data-part1/dolmino_math_synth_gsm_gsm8k","/scratch/smollm3-data-part1/dolmino_math_synth_basic"],"dataset_weights":[0.2,0.3,0.002,0.0002,0.008,0.004,0.001,0.018,0.022,0.023,0.0125,0.0045,0.01,0.01,0.009,0.0032,0.0032,0.0032,0.0032,0.00005,0.0022,0.002,0.002,0.02,0.025,0.014,0.002,0.05,0.07,0.018,0.018,0.008,0.044,0.006,0.006,0.003,0.002,0.013,0.001,0.005,0.0102,0.001,0.0005,0.006,0.005,0.0006,0.012,0.004,0.005,0.005,0.0005,0.001,0.003,0.003,0.0004,0.0002],"return_positions":true,"token_size_in_bytes":4},"num_loading_workers":0,"seed":6},"name":"decay stage","sequence_length":4096,"start_training_step":4198001}]
true
smollm3-3B-final
loubna-48n-2105-decay
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Finished
loubnabnl
1d 7h 34m 41s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"sequence_length":4096,"start_training_step":1,"data":{"dataset":{"pad_samples_to_global_batch_size":false,"return_positions":true,"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"]},"num_loading_workers":0,"seed":6},"name":"stable"},{"data":{"dataset":{"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"pad_samples_to_global_batch_size":false,"return_positions":true,"shuffle_files":false,"skip_in_stream":false,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004],"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B"},"num_loading_workers":0,"seed":6},"name":"stable stage 2","sequence_length":4096,"start_training_step":3450001}]
true
smollm3-3B-final
elie-48n-2105-stage2
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
6000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
loubnabnl
2d 23h 16m 22s
-
2000
/scratch/loubna-checkpoints-stage2
false
true
true
true
false
[{"data":{"dataset":{"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"pad_samples_to_global_batch_size":false,"return_positions":true,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"shuffle_files":false,"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"]},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096,"start_training_step":1},{"sequence_length":4096,"start_training_step":3450001,"data":{"dataset":{"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized_4097/megamath-web-pro/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-qa-qwen/standard/","s3://smollm3/datasets/llama_tokenized_4097/megamath-text-code-block/tokenized/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/megamath-web-pro","/scratch/smollm3-data-part1/megamath-qa-qwen","/scratch/smollm3-data-part1/megamath-text-code-block","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"pad_samples_to_global_batch_size":false,"return_positions":true,"token_size_in_bytes":4,"dataset_weights":[0.3,0.33,0.016,0.001,0.002,0.016,0.02,0.0232,0.0105,0.01,0.01,0.01,0.002,0.00325,0.00325,0.00325,0.00325,0.00005,0.00225,0.01,0.01,0.01,0.02,0.02,0.0008,0.02,0.025,0.01725,0.01625,0.007,0.018,0.006,0.004,0.003,0.001,0.006,0.0002,0.00611,0.00614,0.0008,0.0005,0.0001,0.0114,0.0005,0.01,0.004],"shuffle_files":false,"skip_in_stream":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"vocab_size":128256},"num_loading_workers":0,"seed":6},"name":"stable stage 2"}]
true
smollm3-3B-final
elie-48n-2105-stage2
6
8
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_configs/smollm3_eval.yaml
4000
-
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/evals-ckpt
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/logs
/fsx/loubna/projects_v2/smollm3/nanotron
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/loubna/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/loubna/projects_v2/smollm3/nanotron/ablations/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Finished
eliebak
12h 33s
-
2000
/scratch/elie/checkpoints
false
true
true
true
false
[{"start_training_step":1,"data":{"dataset":{"dataset_max_tokens":null,"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"pad_samples_to_global_batch_size":false,"return_positions":true,"shuffle_files":false,"skip_in_stream":false,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096}]
true
smollm3-3B-final
elie-48n-1205-part1-fix
6
8
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/smollm3_eval.yaml
4000
-
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/evals-ckpt
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/logs
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/elie_bakouch/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
eliebak
2h 50m 31s
-
2000
/scratch/elie/checkpoints
false
true
true
true
false
[{"data":{"dataset":{"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"return_positions":true,"skip_in_stream":false,"token_size_in_bytes":4,"use_old_brrr_dataloader":false,"pad_samples_to_global_batch_size":false,"shuffle_files":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","vocab_size":128256},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096,"start_training_step":1}]
true
smollm3-3B-final
elie-48n-1205-part1-fix
6
8
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/smollm3_eval.yaml
4000
-
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/evals-ckpt
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/logs
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/elie_bakouch/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
eliebak
21h 28m 30s
-
2000
/scratch/elie/checkpoints
false
true
true
true
false
[{"data":{"dataset":{"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"pad_samples_to_global_batch_size":false,"shuffle_files":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"return_positions":true,"skip_in_stream":false,"vocab_size":128256},"num_loading_workers":0,"seed":6},"name":"stable","sequence_length":4096,"start_training_step":1}]
true
smollm3-3B-final
elie-48n-1205-part1-fix
6
8
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/smollm3_eval.yaml
4000
-
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/evals-ckpt
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/logs
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/elie_bakouch/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
eliebak
13h 4m 24s
-
2000
/scratch/elie/checkpoints
false
true
true
true
false
[{"data":{"num_loading_workers":0,"seed":6,"dataset":{"return_positions":true,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"shuffle_files":false,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"dataset_max_tokens":null,"pad_samples_to_global_batch_size":false}},"name":"stable","sequence_length":4096,"start_training_step":1}]
true
smollm3-3B-final
elie-48n-1205-part1
6
8
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/smollm3_eval.yaml
4000
-
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/evals-ckpt
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/logs
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/elie_bakouch/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
eliebak
1d 18h 42m 34s
-
2000
/scratch/elie/checkpoints
false
true
true
true
false
[{"data":{"num_loading_workers":0,"seed":6,"dataset":{"dataset_max_tokens":null,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"pad_samples_to_global_batch_size":false,"shuffle_files":false,"skip_in_stream":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"return_positions":true,"token_size_in_bytes":4,"vocab_size":128256}},"name":"stable","start_training_step":1}]
true
smollm3-3B-final
elie-48n-0505-part1
6
8
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/smollm3_eval.yaml
4000
-
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/evals-ckpt
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/logs
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/elie_bakouch/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Failed
eliebak
2s
-
2000
/scratch/elie/checkpoints
false
true
true
true
false
[{"data":{"dataset":{"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath-4plus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-real-shuffled-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/infiwebmath-4plus","/scratch/smollm3-data-part1/finemath-4plus","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Python","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Java","/scratch/smollm3-data-part1/stack-edu-real-shuffled-JavaScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Cpp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-C-Sharp","/scratch/smollm3-data-part1/stack-edu-real-shuffled-PHP","/scratch/smollm3-data-part1/stack-edu-real-shuffled-TypeScript","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Swift","/scratch/smollm3-data-part1/stack-edu-real-shuffled-SQL","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Ruby","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Rust","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Go","/scratch/smollm3-data-part1/stack-edu-real-shuffled-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"dataset_weights":[0.2132,0.325,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.02,0.025,0.02,0.035,0.07,0.013,0.013,0.007,0.045,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.01,0.01,0.011,0.004],"return_positions":true,"shuffle_files":false,"use_old_brrr_dataloader":false,"vocab_size":128256,"dataset_max_tokens":null,"pad_samples_to_global_batch_size":false,"skip_in_stream":false,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B"},"num_loading_workers":0,"seed":6},"name":"stable","start_training_step":1}]
true
smollm3-3B-final
elie-48n-0705-decay
6
8
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/smollm3_eval.yaml
4000
-
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/evals-ckpt
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/logs
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/elie_bakouch/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
eliebak
1d 13h 2m 33s
-
2000
/scratch/elie/checkpoints
false
true
true
true
false
[{"name":"stable","start_training_step":1,"data":{"dataset":{"token_size_in_bytes":4,"vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_max_tokens":null,"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"return_positions":true,"shuffle_files":false,"skip_in_stream":false,"tokenizer_name":"meta-llama/Llama-3.2-1B","use_old_brrr_dataloader":false,"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"pad_samples_to_global_batch_size":false},"num_loading_workers":0,"seed":6}}]
true
smollm3-3B-final
elie-48n-0505-part1
6
8
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/smollm3_eval.yaml
4000
-
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/evals-ckpt
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/logs
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/elie_bakouch/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/launch-config
true
huggingface
smollm3-3B-evals
Crashed
eliebak
3d 10h 59m 34s
-
2000
/scratch/elie/checkpoints
false
true
true
true
false
[{"data":{"dataset":{"dataset_weights":[0.333,0.37,0.02,0.001,0.004,0.016,0.02,0.022,0.0105,0.01,0.01,0.01,0.003,0.00325,0.00325,0.00325,0.00325,0.00325,0.00225,0.01,0.017,0.025,0.013,0.013,0.007,0.018,0.006,0.006,0.003,0.001,0.004,0.0008,0.005,0.006,0.0008,0.0005,0.0007,0.006,0.0005,0.0055,0.0032],"pad_samples_to_global_batch_size":false,"return_positions":true,"token_size_in_bytes":4,"tokenizer_name":"meta-llama/Llama-3.2-1B","vocab_size":128256,"dataset_folder":["s3://smollm3/datasets/llama_tokenized-global-chunks/fineweb-edu/fineweb-edu/","s3://smollm3/datasets/llama_tokenized-global-chunks/dclm/dclm/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pes2o/","s3://smollm3/datasets/llama_tokenized-individual-chunks/wiki/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stackexchange/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fra/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-spa/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-deu/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ita/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-por/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-cmn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-rus/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-fas/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-jpn/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-kor/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-hin/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-tha/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-vie/","s3://smollm3/datasets/llama_tokenized-individual-chunks/fw2-ell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/infiwebmath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/finemath/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Python/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Java/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-JavaScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Cpp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-C-Sharp/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-PHP/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-TypeScript/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Swift/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-SQL/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Ruby/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Markdown/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-HTML/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Rust/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Go/","s3://smollm3/datasets/llama_tokenized-individual-chunks/stack-edu-Shell/","s3://smollm3/datasets/llama_tokenized-individual-chunks/pull-requests/","s3://smollm3/datasets/llama_tokenized-individual-chunks/kaggle/","s3://smollm3/datasets/llama_tokenized-individual-chunks/jupyter-scripts/","s3://smollm3/datasets/llama_tokenized-individual-chunks/github-issues/"],"dataset_read_path":["/scratch/smollm3-data-part1/fineweb-edu","/scratch/smollm3-data-part1/dclm","/scratch/smollm3-data-part1/pes2o","/scratch/smollm3-data-part1/wiki","/scratch/smollm3-data-part1/stackexchange","/scratch/smollm3-data-part1/fw2-fra","/scratch/smollm3-data-part1/fw2-spa","/scratch/smollm3-data-part1/fw2-deu","/scratch/smollm3-data-part1/fw2-ita","/scratch/smollm3-data-part1/fw2-por","/scratch/smollm3-data-part1/fw2-cmn","/scratch/smollm3-data-part1/fw2-rus","/scratch/smollm3-data-part1/fw2-fas","/scratch/smollm3-data-part1/fw2-jpn","/scratch/smollm3-data-part1/fw2-kor","/scratch/smollm3-data-part1/fw2-hin","/scratch/smollm3-data-part1/fw2-tha","/scratch/smollm3-data-part1/fw2-vie","/scratch/smollm3-data-part1/fw2-ell","/scratch/smollm3-data-part1/infiwebmath","/scratch/smollm3-data-part1/finemath","/scratch/smollm3-data-part1/stack-edu-Python","/scratch/smollm3-data-part1/stack-edu-Java","/scratch/smollm3-data-part1/stack-edu-JavaScript","/scratch/smollm3-data-part1/stack-edu-C","/scratch/smollm3-data-part1/stack-edu-Cpp","/scratch/smollm3-data-part1/stack-edu-C-Sharp","/scratch/smollm3-data-part1/stack-edu-PHP","/scratch/smollm3-data-part1/stack-edu-TypeScript","/scratch/smollm3-data-part1/stack-edu-Swift","/scratch/smollm3-data-part1/stack-edu-SQL","/scratch/smollm3-data-part1/stack-edu-Ruby","/scratch/smollm3-data-part1/stack-edu-Markdown","/scratch/smollm3-data-part1/stack-edu-HTML","/scratch/smollm3-data-part1/stack-edu-Rust","/scratch/smollm3-data-part1/stack-edu-Go","/scratch/smollm3-data-part1/stack-edu-Shell","/scratch/smollm3-data-part1/pull-requests","/scratch/smollm3-data-part1/kaggle","/scratch/smollm3-data-part1/jupyter-scripts","/scratch/smollm3-data-part1/github-issues"],"shuffle_files":false,"skip_in_stream":false,"use_old_brrr_dataloader":false,"dataset_max_tokens":null},"num_loading_workers":0,"seed":6},"name":"stable","start_training_step":1}]
true
smollm3-3B-final
elie-48n-1704-part1
6
8
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/smollm3_eval.yaml
4000
-
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/evals-ckpt
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/logs
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/results
1
4
1
false
1
afab
false
2
true
ALL_REDUCE
true
88
8
/fsx/elie_bakouch/.cache/huggingface
hopper-prod
normal
smollm
01:59:00
/fsx/elie_bakouch/smollm3_training/1004-nn/nanotron/_final1004/eval_results/launch-config
true
huggingface
smollm3-3B-evals
1-20
of 29