Global shape mismatch for loaded ((1024, 768)) and expected ((512, 768)) tensor for key model.embedding.position_embeddings.weight
Alireza3242 opened this issue · 3 comments
Alireza3242 commented
Describe the bug
I followed the instructions in:
https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/nlp/nemo_megatron/gpt/gpt_training.html
Then I replaced 1024 with 512 and ran:
python src/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
--config-path=conf \
--config-name=megatron_gpt_config \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_epochs=null \
trainer.max_steps=30000 \
trainer.val_check_interval=300 \
trainer.log_every_n_steps=50 \
trainer.limit_val_batches=50 \
trainer.limit_test_batches=50 \
trainer.accumulate_grad_batches=1 \
trainer.precision=16 \
model.micro_batch_size=6 \
model.global_batch_size=192 \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.max_position_embeddings=512 \
model.encoder_seq_length=512 \
model.hidden_size=768 \
model.ffn_hidden_size=3072 \
model.num_layers=12 \
model.num_attention_heads=12 \
model.init_method_std=0.021 \
model.hidden_dropout=0.1 \
model.layernorm_epsilon=1e-5 \
model.tokenizer.vocab_file=data/tokenizer/gpt2-vocab.json \
model.tokenizer.merge_file=data/tokenizer/gpt2-merges.txt \
model.data.data_prefix=[1.0,data/tokenized_data/hfbpe_gpt_training_data_text_document] \
model.data.num_workers=2 \
model.data.seq_length=512 \
model.data.splits_string=\'980,10,10\' \
model.megatron_amp_O2=False \
model.optim.name=fused_adam \
model.optim.lr=6e-4 \
model.optim.betas=[0.9,0.95] \
model.optim.weight_decay=0.1 \
model.optim.sched.name=CosineAnnealing \
model.optim.sched.warmup_steps=750 \
model.optim.sched.constant_steps=80000 \
model.optim.sched.min_lr=6e-5 \
exp_manager.resume_if_exists=True \
exp_manager.resume_ignore_no_checkpoint=True \
exp_manager.create_checkpoint_callback=True \
exp_manager.checkpoint_callback_params.monitor=val_loss \
exp_manager.checkpoint_callback_params.save_top_k=3 \
exp_manager.checkpoint_callback_params.mode=min \
exp_manager.checkpoint_callback_params.always_save_nemo=False
I get this error:
Error executing job with overrides: ['trainer.devices=1', 'trainer.num_nodes=1', 'trainer.max_epochs=null', 'trainer.max_steps=30000', 'trainer.val_check_interval=300', 'trainer.log_every_n_steps=50', 'trainer.limit_val_batches=50', 'trainer.limit_test_batches=50', 'trainer.accumulate_grad_batches=1', 'trainer.precision=16', 'model.micro_batch_size=6', 'model.global_batch_size=192', 'model.tensor_model_parallel_size=1', 'model.pipeline_model_parallel_size=1', 'model.max_position_embeddings=512', 'model.encoder_seq_length=512', 'model.fp8_amax_history_len=512', 'model.hidden_size=768', 'model.ffn_hidden_size=3072', 'model.num_layers=12', 'model.num_attention_heads=12', 'model.init_method_std=0.021', 'model.hidden_dropout=0.1', 'model.layernorm_epsilon=1e-5', 'model.tokenizer.vocab_file=data/tokenizer/gpt2-vocab.json', 'model.tokenizer.merge_file=data/tokenizer/gpt2-merges.txt', 'model.data.data_prefix=[1.0,data/tokenized_data/hfbpe_gpt_training_data_text_document]', 'model.data.num_workers=2', 'model.data.seq_length=512', "model.data.splits_string='980,10,10'", 'model.megatron_amp_O2=False', 'model.optim.name=fused_adam', 'model.optim.lr=6e-4', 'model.optim.betas=[0.9,0.95]', 'model.optim.weight_decay=0.1', 'model.optim.sched.name=CosineAnnealing', 'model.optim.sched.warmup_steps=750', 'model.optim.sched.constant_steps=80000', 'model.optim.sched.min_lr=6e-5', 'exp_manager.resume_if_exists=True', 'exp_manager.resume_ignore_no_checkpoint=True', 'exp_manager.create_checkpoint_callback=True', 'exp_manager.checkpoint_callback_params.monitor=val_loss', 'exp_manager.checkpoint_callback_params.save_top_k=3', 'exp_manager.checkpoint_callback_params.mode=min', 'exp_manager.checkpoint_callback_params.always_save_nemo=False']
Traceback (most recent call last):
File "/app/src/examples/nlp/language_modeling/megatron_gpt_pretraining.py", line 42, in main
trainer.fit(model)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 543, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
return function(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 579, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 973, in _run
self._checkpoint_connector._restore_modules_and_callbacks(ckpt_path)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 397, in _restore_modules_and_callbacks
self.resume_start(checkpoint_path)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 79, in resume_start
loaded_checkpoint = self.trainer.strategy.load_checkpoint(checkpoint_path)
File "/opt/NeMo/nemo/collections/nlp/parts/nlp_overrides.py", line 441, in load_checkpoint
return self.checkpoint_io.load_checkpoint(checkpoint_path, sharded_state_dict=checkpoint)
File "/opt/NeMo/nemo/utils/callbacks/dist_ckpt_io.py", line 78, in load_checkpoint
return dist_checkpointing.load(
File "/opt/megatron-lm/megatron/core/dist_checkpointing/serialization.py", line 135, in load
loaded_state_dict = sharded_strategy.load(sharded_state_dict, checkpoint_dir)
File "/opt/megatron-lm/megatron/core/dist_checkpointing/strategies/tensorstore.py", line 43, in load
dict_list_map_inplace(load_fn, sharded_state_dict)
File "/opt/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py", line 180, in dict_list_map_inplace
x[k] = dict_list_map_inplace(f, v)
File "/opt/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py", line 180, in dict_list_map_inplace
x[k] = dict_list_map_inplace(f, v)
File "/opt/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py", line 184, in dict_list_map_inplace
return f(x)
File "/opt/megatron-lm/megatron/core/dist_checkpointing/strategies/tensorstore.py", line 80, in _load_from_array
x = _load_regular_chunk(sharded_tensor, checkpoint_dir)
File "/opt/megatron-lm/megatron/core/dist_checkpointing/strategies/tensorstore.py", line 107, in _load_regular_chunk
raise CheckpointingException(_msg)
megatron.core.dist_checkpointing.core.CheckpointingException: Global shape mismatch for loaded ((1024, 768)) and expected ((512, 768)) tensor for key model.embedding.position_embeddings.weight
Environment details
GPU: A100
NeMo Docker image: nvcr.io/nvidia/nemo:24.05.01
Alireza3242 commented
I found it:
I had to delete this folder:
nemo_experiments/megatron_gpt/checkpoints
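This likely works because an earlier run with the default max_position_embeddings=1024 saved a checkpoint there, and exp_manager.resume_if_exists=True makes the new 512-position run try to restore it, producing the (1024, 768) vs (512, 768) shape mismatch on model.embedding.position_embeddings.weight. A minimal sketch of the two workarounds, assuming the checkpoint directory from this issue (adjust the path to your own exp_manager output directory):

# Option 1: remove the stale checkpoints saved with the old 1024-position config
rm -rf nemo_experiments/megatron_gpt/checkpoints

# Option 2: skip auto-resume for the new run, leaving the old checkpoints in place
python src/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
    --config-path=conf \
    --config-name=megatron_gpt_config \
    exp_manager.resume_if_exists=False \
    model.max_position_embeddings=512 \
    model.encoder_seq_length=512 \
    model.data.seq_length=512
    # ...plus the remaining overrides from the command above

Either way, the next run starts from freshly initialized weights with the 512-position embedding table instead of trying to load the old 1024-position tensor.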