HarderThenHarder/transformers_tasks

Training with --quantization_bit 4 fails with RuntimeError: self and mat2 must have the same dtype

shangzhensen opened this issue · 3 comments

                                  ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
                                  ┃ key                       ┃ value                          ┃
                                  ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
                                  │ train_path                │ data/mixed_train_dataset.jsonl │
                                  │ dev_path                  │ data/mixed_dev_dataset.jsonl   │
                                  │ save_dir                  │ checkpoints/finetune           │
                                  │ max_source_seq_len        │ 8                              │
                                  │ max_target_seq_len        │ 8                              │
                                  │ batch_size                │ 1                              │
                                  │ learning_rate             │ 3e-05                          │
                                  │ weight_decay              │ 0.0                            │
                                  │ num_train_epochs          │ 2                              │
                                  │ warmup_ratio              │ 0.0                            │
                                  │ save_freq                 │ 1000                           │
                                  │ logging_steps             │ 100                            │
                                  │ device                    │ cuda:1                         │
                                  │ img_log_dir               │ log/fintune_log                │
                                  │ img_log_name              │ ChatGLM Fine-Tune              │
                                  │ use_lora                  │ True                           │
                                  │ use_ptuning               │ False                          │
                                  │ lora_rank                 │ 4                              │
                                  │ pre_seq_len               │ 128                            │
                                  │ prefix_projection         │ False                          │
                                  │ preprocessing_num_workers │ 1                              │
                                  │ quantization_bit          │ 4                              │
                                  └───────────────────────────┴────────────────────────────────┘

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /ml/szs/Project/LLM_szs/transformers_tasks/LLM/finetune/train.py:352 in │
│ │
│ 349 │
│ 350 │
│ 351 if __name__ == "__main__": │
│ ❱ 352 │ main() │
│ 353 │
│ │
│ /ml/szs/Project/LLM_szs/transformers_tasks/LLM/finetune/train.py:295 in main │
│ │
│ 292 │ │ for batch in train_dataloader: │
│ 293 │ │ │ if args.use_lora: │
│ 294 │ │ │ │ with autocast(): │
│ ❱ 295 │ │ │ │ │ loss = model( │
│ 296 │ │ │ │ │ │ input_ids=batch['input_ids'].to(dtype=torch.long, device=args.de │
│ 297 │ │ │ │ │ │ labels=batch['labels'].to(dtype=torch.long, device=args.device) │
│ 298 │ │ │ │ │ ).loss │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/peft/peft_model.py:678 in forward │
│ │
│ 675 │ ): │
│ 676 │ │ peft_config = self.active_peft_config │
│ 677 │ │ if not isinstance(peft_config, PromptLearningConfig): │
│ ❱ 678 │ │ │ return self.base_model( │
│ 679 │ │ │ │ input_ids=input_ids, │
│ 680 │ │ │ │ attention_mask=attention_mask, │
│ 681 │ │ │ │ inputs_embeds=inputs_embeds, │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:1160 in forward │
│ │
│ 1157 │ │ use_cache = use_cache if use_cache is not None else self.config.use_cache │
│ 1158 │ │ return_dict = return_dict if return_dict is not None else self.config.use_return │
│ 1159 │ │ │
│ ❱ 1160 │ │ transformer_outputs = self.transformer( │
│ 1161 │ │ │ input_ids=input_ids, │
│ 1162 │ │ │ position_ids=position_ids, │
│ 1163 │ │ │ attention_mask=attention_mask, │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:973 in forward │
│ │
│ 970 │ │ │ │ │ output_attentions │
│ 971 │ │ │ │ ) │
│ 972 │ │ │ else: │
│ ❱ 973 │ │ │ │ layer_ret = layer( │
│ 974 │ │ │ │ │ hidden_states, │
│ 975 │ │ │ │ │ position_ids=position_ids, │
│ 976 │ │ │ │ │ attention_mask=attention_mask, │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:614 in forward │
│ │
│ 611 │ │ attention_input = self.input_layernorm(hidden_states) │
│ 612 │ │ │
│ 613 │ │ # Self attention. │
│ ❱ 614 │ │ attention_outputs = self.attention( │
│ 615 │ │ │ attention_input, │
│ 616 │ │ │ position_ids, │
│ 617 │ │ │ attention_mask=attention_mask, │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:439 in forward │
│ │
│ 436 │ │ """ │
│ 437 │ │ │
│ 438 │ │ # [seq_len, batch, 3 * hidden_size] │
│ ❱ 439 │ │ mixed_raw_layer = self.query_key_value(hidden_states) │
│ 440 │ │ │
│ 441 │ │ # [seq_len, batch, 3 * hidden_size] --> [seq_len, batch, num_attention_heads, 3 │
│ 442 │ │ new_tensor_shape = mixed_raw_layer.size()[:-1] + ( │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/peft/tuners/lora.py:565 in forward │
│ │
│ 562 │ │ │ │ self.unmerge() │
│ 563 │ │ │ result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self. │
│ 564 │ │ elif self.r[self.active_adapter] > 0 and not self.merged: │
│ ❱ 565 │ │ │ result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self. │
│ 566 │ │ │ │
│ 567 │ │ │ x = x.to(self.lora_A[self.active_adapter].weight.dtype) │
│ 568 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: self and mat2 must have the same dtype
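
For what it's worth, the failing frame is peft's `lora.Linear.forward`: with `--quantization_bit 4`, ChatGLM packs `query_key_value` into a low-bit integer weight, and the LoRA wrapper still calls `F.linear` on it with the fp16 activations produced under `autocast()`. Below is a minimal sketch to inspect the colliding dtypes; the `THUDM/chatglm-6b` path and the LoRA hyperparameters are assumptions, not necessarily the exact values used by train.py.

```python
import torch
from transformers import AutoModel
from peft import LoraConfig, TaskType, get_peft_model

# Assumed checkpoint; train.py may load a local path instead.
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = model.quantize(4).half().cuda()   # roughly what --quantization_bit 4 triggers

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,                                  # lora_rank from the config table above
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query_key_value"],
)
model = get_peft_model(model, lora_config)

# The quantized base weight is no longer a floating-point tensor, so the fp16
# activations produced under autocast() cannot be multiplied with it by F.linear.
qkv = next(m for n, m in model.named_modules() if n.endswith("attention.query_key_value"))
print(type(qkv).__name__, qkv.weight.dtype)   # expected: a LoRA Linear holding an int8 (packed 4-bit) weight
```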
After the model is quantized, training reports this dtype mismatch; multi-GPU runs hit the same problem. Training is launched with --quantization_bit 4.
@HarderThenHarder Could you please advise how to resolve this?
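
One direction that is often suggested for this combination (I have not verified it against this repo): skip ChatGLM's own `--quantization_bit` path when training LoRA and quantize through bitsandbytes instead, since peft ships LoRA layers that understand `Linear8bitLt`. A rough sketch, assuming the `THUDM/chatglm-6b` checkpoint and a LoRA setup equivalent to the flags above (on newer peft versions the helper is `prepare_model_for_kbit_training`):

```python
from transformers import AutoModel
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_int8_training

# Assumed checkpoint path; substitute whatever train.py actually points at.
model = AutoModel.from_pretrained(
    "THUDM/chatglm-6b",
    trust_remote_code=True,
    load_in_8bit=True,        # bitsandbytes quantization instead of --quantization_bit
    device_map="auto",
)
# Freezes the int8 base weights, upcasts LayerNorms, and prepares inputs for gradients.
model = prepare_model_for_int8_training(model)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,                      # lora_rank from the config table above
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query_key_value"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```

With this path the base matmuls go through bitsandbytes' int8 kernels, which accept fp16 activations, so the autocast() block in train.py should not hit the dtype check. Is that the recommended route, or is there a fix that keeps --quantization_bit 4?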

@shangzhensen Did you manage to solve this? I'm running into the same problem.

Same here, also looking for a solution.