MoE loss variable not defined in gpt-j-residual code path
tf-nv opened this issue · 0 comments
tf-nv commented
Running pythia 14M on master:
File "/gpt-neox/train.py", line 34, in <module>
main()
File "/gpt-neox/train.py", line 30, in main
pretrain(neox_args=neox_args)
File "/gpt-neox/megatron/training.py", line 228, in pretrain
iteration = train(
File "/gpt-neox/megatron/training.py", line 913, in train
loss_dict, skipped_iter = train_step(
File "/gpt-neox/megatron/training.py", line 793, in train_step
loss = forward_step(
File "/gpt-neox/megatron/training.py", line 391, in forward_step
maybe_tuple = model((tokens, position_ids, attention_mask), neox_args=neox_args)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 1822, in forward
loss = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/gpt-neox/megatron/model/utils.py", line 190, in forward
x = func(forward_input)
File "/gpt-neox/megatron/model/utils.py", line 181, in exec_func
inputs = layer(inputs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/gpt-neox/megatron/model/transformer.py", line 1167, in forward
output, moe_loss = super().forward(hidden_states, attention_mask)
File "/gpt-neox/megatron/model/transformer.py", line 1155, in forward
return output, moe_loss
The `moe_loss` variable is not defined inside the `forward()` for `ParallelTransformerLayer` in the `gpt-j-residual==true` branch.
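
For illustration, here is a standalone repro of the failure pattern. This is stand-in code, not the actual gpt-neox layer; `forward` and its arguments are simplified placeholders:

```python
import torch

def forward(hidden_states, gpt_j_residual: bool):
    # Stand-in for ParallelTransformerLayer.forward: moe_loss is only
    # assigned on the non-gpt-j-residual path.
    if gpt_j_residual:
        output = hidden_states  # parallel attn/MLP path, never assigns moe_loss
    else:
        output = hidden_states
        moe_loss = torch.zeros(1)  # MoE auxiliary loss from the MLP
    return output, moe_loss

# Raises UnboundLocalError: local variable 'moe_loss' referenced before assignment
forward(torch.zeros(1), gpt_j_residual=True)
```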
The variable was introduced when #1129 was merged. I am not too familiar with MoE; maybe @yang can comment on this?
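
One possible fix, sketched on the same stand-in (I haven't checked whether the gpt-j-residual path is supposed to compute a real MoE auxiliary loss, so the zero default is an assumption):

```python
import torch

def forward(hidden_states, gpt_j_residual: bool):
    # Define the auxiliary loss up front so both branches return something.
    moe_loss = torch.tensor(0.0, device=hidden_states.device,
                            dtype=hidden_states.dtype)
    if gpt_j_residual:
        output = hidden_states  # parallel path: moe_loss stays zero
    else:
        output = hidden_states
        moe_loss = torch.zeros(1)  # stand-in for the real MoE aux loss
    return output, moe_loss
```

If the gpt-j-residual branch is meant to support MoE experts, a zero default would only paper over the missing loss; if it isn't, the default just restores the pre-#1129 behavior.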