lyhue1991/torchkeras

baichuan13b NER training error

zhouzhou0322 opened this issue · 0 comments

out = peft_model.forward(**batch)
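For context, the failing call is presumably reached from an AdaLoRA-on-4-bit setup roughly like the sketch below (the traceback goes through `peft/tuners/adalora/bnb.py` and the Baichuan-13B-Chat remote code). The model path, config values and `target_modules` choice here are assumptions, not copied from the notebook:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import AdaLoraConfig, get_peft_model, prepare_model_for_kbit_training

model_name = "baichuan-inc/Baichuan-13B-Chat"  # assumed; a local path may be used instead

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

peft_config = AdaLoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["W_pack"],  # Baichuan packs Q/K/V into W_pack (visible in the traceback)
    init_r=12,
    target_r=8,
)
peft_model = get_peft_model(model, peft_config)

# The call that raises the error below:
# out = peft_model.forward(**batch)
```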

AttributeError Traceback (most recent call last)
Cell In[59], line 1
----> 1 out = peft_model.forward(**batch)

File /usr/local/lib/python3.10/dist-packages/peft/peft_model.py:931, in PeftModelForCausalLM.forward(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)
920 raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds")
921 return self.base_model(
922 input_ids=input_ids,
923 attention_mask=attention_mask,
(...)
928 **kwargs,
929 )
--> 931 return self.base_model(
932 input_ids=input_ids,
933 attention_mask=attention_mask,
934 inputs_embeds=inputs_embeds,
935 labels=labels,
936 output_attentions=output_attentions,
937 output_hidden_states=output_hidden_states,
938 return_dict=return_dict,
939 **kwargs,
940 )
942 batch_size = _get_batch_size(input_ids, inputs_embeds)
943 if attention_mask is not None:
944 # concat prompt attention mask

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.10/dist-packages/peft/tuners/adalora/model.py:234, in AdaLoraModel.forward(self, *args, **kwargs)
233 def forward(self, *args, **kwargs):
--> 234 outputs = self.model.forward(*args, **kwargs)
236 if getattr(outputs, "loss", None) is not None:
237 # Calculate the orthogonal regularization
238 orth_reg_weight = self.peft_config[self.trainable_adapter_name].orth_reg_weight

File /usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/Baichuan-13B-Chat/modeling_baichuan.py:447, in BaichuanForCausalLM.forward(self, input_ids, attention_mask, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **kwargs)
444 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
446 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
--> 447 outputs = self.model(
448 input_ids=input_ids,
449 attention_mask=attention_mask,
450 past_key_values=past_key_values,
451 inputs_embeds=inputs_embeds,
452 use_cache=use_cache,
453 output_attentions=output_attentions,
454 output_hidden_states=output_hidden_states,
455 return_dict=return_dict,
456 )
458 hidden_states = outputs[0]
459 logits = self.lm_head(hidden_states)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/Baichuan-13B-Chat/modeling_baichuan.py:370, in BaichuanModel.forward(self, input_ids, attention_mask, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
363 layer_outputs = torch.utils.checkpoint.checkpoint(
364 create_custom_forward(decoder_layer),
365 hidden_states,
366 attention_mask,
367 None,
368 )
369 else:
--> 370 layer_outputs = decoder_layer(
371 hidden_states,
372 attention_mask=attention_mask,
373 past_key_value=past_key_value,
374 output_attentions=output_attentions,
375 use_cache=use_cache,
376 )
378 hidden_states = layer_outputs[0]
380 if use_cache:

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/Baichuan-13B-Chat/modeling_baichuan.py:193, in BaichuanLayer.forward(self, hidden_states, attention_mask, past_key_value, output_attentions, use_cache)
190 hidden_states = self.input_layernorm(hidden_states)
192 # Self Attention
--> 193 hidden_states, self_attn_weights, present_key_value = self.self_attn(
194 hidden_states=hidden_states,
195 attention_mask=attention_mask,
196 past_key_value=past_key_value,
197 output_attentions=output_attentions,
198 use_cache=use_cache,
199 )
200 hidden_states = residual + hidden_states
202 # Fully Connected

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/Baichuan-13B-Chat/modeling_baichuan.py:124, in BaichuanAttention.forward(self, hidden_states, attention_mask, past_key_value, output_attentions, use_cache)
113 def forward(
114 self,
115 hidden_states: torch.Tensor,
(...)
119 use_cache: bool = False,
120 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
122 bsz, q_len, _ = hidden_states.size()
--> 124 proj = self.W_pack(hidden_states)
125 proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2)
126 query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.10/dist-packages/peft/tuners/adalora/bnb.py:145, in SVDLinear4bit.forward(self, x)
143 if requires_conversion:
144 expected_dtype = result.dtype
--> 145 compute_dtype = lora_A.weight.dtype
146 if x.dtype != compute_dtype:
147 x = x.to(compute_dtype)

AttributeError: 'Parameter' object has no attribute 'weight'
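The exception itself comes from peft's AdaLoRA 4-bit layer: `SVDLinear4bit.forward` reads `lora_A.weight.dtype`, but AdaLoRA stores its low-rank factors (`lora_A` / `lora_E` / `lora_B`) as bare `nn.Parameter`s inside an `nn.ParameterDict`, so there is no `.weight` attribute on them. A toy illustration of the mismatch (just a plain Parameter, not the actual peft object):

```python
import torch
import torch.nn as nn

# AdaLoRA keeps lora_A as a bare Parameter, not an nn.Linear sub-module,
# so .weight does not exist on it -- only .dtype, .shape, etc.
lora_A = nn.Parameter(torch.empty(8, 4096, dtype=torch.float16))

print(lora_A.dtype)               # torch.float16 -- the dtype lives on the Parameter itself
print(hasattr(lora_A, "weight"))  # False -> lora_A.weight.dtype raises AttributeError
```

If that reading is right, this looks like a peft-version issue in the AdaLoRA + bitsandbytes code path rather than a problem in the notebook itself, which is why the installed versions matter.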

Could you share the torch and peft versions you used?
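For comparison, the versions in the environment that produced the traceback can be printed with something like this (assuming the packages are importable there):

```python
import torch, transformers, peft, accelerate, bitsandbytes

# Versions relevant to the AdaLoRA / 4-bit code path in the traceback.
for name, mod in [("torch", torch), ("transformers", transformers),
                  ("peft", peft), ("accelerate", accelerate),
                  ("bitsandbytes", bitsandbytes)]:
    print(f"{name:<13}: {mod.__version__}")
```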