Skip to main content

Chilli's group workspace

Timestamps visible
2025-04-04 06:19:17
[rank0]:     if not isinstance(checkpoint['num_experts'], list):
2025-04-04 06:19:17
[rank0]: KeyError: 'num_experts'
2025-04-04 06:19:17
[rank0]: Traceback (most recent call last):
2025-04-04 06:19:17
[rank0]:   File "/mnt/ssd-1/nora/gpt-neox/train.py", line 35, in <module>
2025-04-04 06:19:17
[rank0]:     main()
2025-04-04 06:19:17
[rank0]:   File "/mnt/ssd-1/nora/gpt-neox/train.py", line 31, in main
2025-04-04 06:19:17
[rank0]:     pretrain(neox_args=neox_args)
2025-04-04 06:19:17
[rank0]:   File "/mnt/ssd-1/nora/gpt-neox/megatron/training.py", line 252, in pretrain
2025-04-04 06:19:17
[rank0]:     model, optimizer, lr_scheduler, reference_model = setup_model_and_optimizer(
2025-04-04 06:19:17
[rank0]:   File "/mnt/ssd-1/nora/gpt-neox/megatron/training.py", line 1328, in setup_model_and_optimizer
2025-04-04 06:19:17
[rank0]:     neox_args.iteration = load_checkpoint(
2025-04-04 06:19:17
[rank0]:   File "/mnt/ssd-1/nora/gpt-neox/megatron/checkpointing.py", line 390, in load_checkpoint
2025-04-04 06:19:17
[rank0]:     checkpoint_name, state_dict = model.load_checkpoint(
2025-04-04 06:19:17
[rank0]:   File "/home/nora/.local/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2938, in load_checkpoint
2025-04-04 06:19:17
[rank0]:     load_path, client_states = self._load_checkpoint(load_dir,
2025-04-04 06:19:17
[rank0]:   File "/home/nora/.local/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3010, in _load_checkpoint
2025-04-04 06:19:17
[rank0]:     if not isinstance(checkpoint['num_experts'], list):
2025-04-04 06:19:17
[rank0]: KeyError: 'num_experts'