BlinkDL/RWKV-LM

训练RWKV-4,报错

fuxuelinwudi opened this issue · 1 comment

deepspeed==0.7.0 pytorch-lightning==1.9.2 torch==1.13.1+cu117
一样的版本;

Traceback (most recent call last):
File "summarization_pipeline.py", line 1382, in <module>
main()
File "summarization_pipeline.py", line 1376, in main
train_ds(configs)
File "summarization_pipeline.py", line 1040, in train_ds
trainer.run(model=model, loss_fct=loss_fct,
File "/home/ubuntu/.local/lib/python3.8/site-packages/lightning_fabric/fabric.py", line 628, in _run_impl
return self._strategy.launcher.launch(run_method, *args, **kwargs)
File "/home/ubuntu/.local/lib/python3.8/site-packages/lightning_fabric/strategies/launchers/subprocess_script.py", line 90, in launch
return function(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.8/site-packages/lightning_fabric/fabric.py", line 638, in _run_with_setup
return run_function(*args, **kwargs)
File "summarization_pipeline.py", line 888, in run
run_epoch('train')
File "summarization_pipeline.py", line 838, in run_epoch
self.backward(loss)
File "/home/ubuntu/.local/lib/python3.8/site-packages/lightning_fabric/fabric.py", line 359, in backward
self._precision.backward(tensor, module, *args, **kwargs)
File "/home/ubuntu/.local/lib/python3.8/site-packages/lightning_fabric/plugins/precision/precision.py", line 73, in backward
tensor.backward(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/_tensor.py", line 488, in backward
torch.autograd.backward(
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/autograd/__init__.py", line 197, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/ubuntu/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 800, in reduce_partition_and_remove_grads
self.reduce_ready_partitions_and_remove_grads(param, i)
File "/home/ubuntu/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1271, in reduce_ready_partitions_and_remove_grads
self.reduce_independent_p_g_buckets_and_remove_grads(param, i)
File "/home/ubuntu/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 845, in reduce_independent_p_g_buckets_and_remove_grads
new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow(
AttributeError: 'DeepSpeedZeroOptimizer' object has no attribute 'ipg_index'

Upgrade Ubuntu to 20.04 or 22.04.