datawhalechina/self-llm

No adapter_config.json after LoRA fine-tuning the deepseek-chat model

HolyCrazy opened this issue · 2 comments

After LoRA fine-tuning the deepseek-chat model there is no adapter_config.json. Other issues say this is caused by the transformers version: the LoRA weights get merged directly into the base model's weights. But when I run the fine-tuned weights directly, every token in the model's reply is 0, as if no actual inference is happening.
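
A quick way to check whether a saved checkpoint is a standalone LoRA adapter or a full/merged model is to inspect its files and try attaching it to the base model with peft. This is only a minimal sketch: the checkpoint path below is a hypothetical placeholder, and the base model path is taken from the script further down.

import os
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_path = "/mnt/bn/models/deepseek-coder-6.7b-instruct"  # base model path from the training script below
ckpt = "/code/lora/checkpoint-100"  # hypothetical checkpoint directory, adjust to your own output

# A standalone LoRA adapter directory contains adapter_config.json plus adapter_model.bin
# (or .safetensors); a full/merged checkpoint contains config.json plus the full model weights.
print(os.listdir(ckpt))

# If adapter_config.json is present, the adapter has to be attached to the base model for inference:
base = AutoModelForCausalLM.from_pretrained(base_path, trust_remote_code=True,
                                            torch_dtype=torch.half, device_map="auto")
model = PeftModel.from_pretrained(base, ckpt)

Loading LoRA-only weights as if they were a complete model would produce meaningless generations, which could match the all-zero tokens described above.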

You could try setting the transformers version to 4.31.3.

There is no transformers 4.31.3, so I used transformers==4.31.0, but it still doesn't work. I'm wondering whether the problem is in my script:

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    GenerationConfig,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, TaskType

MAX_LENGTH = 384  # the Llama tokenizer splits a single Chinese character into multiple tokens, so allow some headroom in the max length to keep examples intact
path = '/mnt/bn/models/deepseek-coder-6.7b-instruct'
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False, trust_remote_code=True)
tokenizer.padding_side = 'right'  # pad on the right

model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.half, device_map="auto")
model.generation_config = GenerationConfig.from_pretrained(path)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

def process_func(example):
    input_ids, attention_mask, labels = [], [], []
    # add_special_tokens=False: don't prepend special tokens, the prompt is built manually
    instruction = tokenizer(f"User: {example['instruction']+example['input']}\n\n", add_special_tokens=False)
    response = tokenizer(f"Assistant: {example['output']}<|end▁of▁sentence|>", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # the eos token should also be attended to, so set it to 1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # truncate
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

dataset = load_dataset("json", data_files="/zhen_huan_dataset_1000.json", split="train")

processed_dataset = dataset.map(
    process_func,
    remove_columns=["instruction", "input", "output"]
)
print(processed_dataset)

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,  # training mode
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha, see the LoRA paper for its exact role
    lora_dropout=0.1  # dropout rate
)

output_dir="/code/lora"

args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=processed_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)
trainer.train()
print("train finish_______________")