NTK length extrapolation
IT-five opened this issue · 3 comments
IT-five commented
The following items must be checked before submitting
- Please make sure you are using the latest code from the repository (git pull); some issues have already been resolved and fixed.
- I have read the project documentation and the FAQ section, and have searched existing issues without finding a similar problem or solution.
- Third-party plugin issues: e.g. llama.cpp, LangChain, text-generation-webui, etc.; it is also recommended to look for solutions in the corresponding projects.
Issue type
Model inference
Base model
Others
Operating system
macOS
Detailed description of the problem
from datasets import load_dataset
import torch
import random
import numpy as np
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
from tqdm import tqdm
import os
import argparse
import sys
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
# from attn_and_long_ctx_patches import apply_attention_patch, apply_ntk_scaling_patch
dir_path = os.path.dirname(os.path.realpath(__file__))
# Model path
model_path = r'/mnt/data/extend_length/Baichuan2_7B_Chat'
# Data path
file_path = r'/mnt/data/extend_length/longbench_data/data/'
# Baichuan2 prompt template (Baichuan automatically adds user_token_id and assistant_token_id for you)
DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""
"""
参数说明
--model_path ${model_path}:待评测模型所在目录(完整的Chinese-LLaMA-2或Chinese-Alpaca-2模型,非LoRA)
--predict_on {data_class}: 指定待预测的任务,可以为en,zh,code,或它们的组合,以逗号分隔,如en,zh,code
--output_dir ${output_dir}:评测结果的输出目录
--max_length ${max_length}:指令的最大长度。注意此长度不包括system prompt以及任务相关prompt在内。
--gpus ${gpus}:如需指定特定的GPU,请使用此参数,如0,1。
--alpha ${alpha}: NTK上下文扩展方法系数。一般设为待处理文本长度 / 模型上下文长度 * 2 - 1。或更方便地设为auto即可。
--e:在LongBench-E数据集上进行预测。参考LongBench官方文档以了解LongBench-E的详细说明。
"""
parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str)
parser.add_argument('--predict_on',type=str, default='zh')
parser.add_argument('--output_dir',type=str, default='pred')
parser.add_argument('--gpus',type=str, default=None)
parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E")
args = parser.parse_args()
model_path = args.model_path
predict_on = args.predict_on
output_dir = args.output_dir
gpus=args.gpus
print(f"Model Path: {model_path}")
print(f"Predict On: {predict_on}")
print(f"Output Directory: {output_dir}")
print(f"GPUs: {gpus}")
print(f"Evaluate Flag: {args.e}")
DO_SAMPLE = True
TEMPERATURE = 0.2
REPETITION_PENALTY = 1.1
TOP_P = 0.95
TOP_K = 40
if gpus is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = gpus
# apply_attention_patch(use_memory_efficient_attention=True)
# apply_ntk_scaling_patch(args.alpha)
def fill_baichuan2_prompt_template(instruction, with_system_prompt=False, system_prompt=DEFAULT_SYSTEM_PROMPT):
    messages = []
    if with_system_prompt is True:
        messages.append({'role': 'system', 'content': system_prompt})
    messages.append({"role": "user", "content": instruction})
    return messages
def get_pred(model, tokenizer, data, max_gen, prompt_format, dataset, device):
    preds = []
    for json_obj in tqdm(data):
        prompt = prompt_format.format(**json_obj)
        # truncate to fit max_length (we suggest truncating in the middle, since the left and right sides may contain crucial instructions)
        # tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
        # if len(tokenized_prompt) > max_length:
        #     half = int(max_length/2)
        #     prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
        if dataset not in ["trec", "triviaqa", "samsum", "lsht", "lcc", "repobench-p"]:  # chat models are better off without chat-style prompts on these tasks
            raw_data = fill_baichuan2_prompt_template(instruction=prompt, with_system_prompt=True)
        else:
            raw_data = fill_baichuan2_prompt_template(instruction=prompt)
        input_data = tokenizer(prompt, truncation=False, return_tensors="pt").to(device)
        context_length = input_data.input_ids.shape[-1]
        if dataset == "samsum":  # prevent illegal output on samsum (model endlessly repeats "\nDialogue"), might be a prompting issue
            generation_config = {
                "pad_token_id": 0,
                "bos_token_id": 1,
                # eos_token_id=[2,5]
                "eos_token_id": [tokenizer.eos_token_id, tokenizer.encode("\n", add_special_tokens=False)[-1]],
                "user_token_id": 195,
                "assistant_token_id": 196,
                # the maximum number of output tokens differs per dataset
                "max_new_tokens": max_gen,
                "min_length": context_length + 1,
                "temperature": 1.0,
                "top_k": 15,
                "top_p": 0.90,
                "repetition_penalty": 1.00,
                "do_sample": False,
                "transformers_version": "4.29.2"
            }
            model.generation_config = GenerationConfig(**generation_config)
        else:
            generation_config = {
                "pad_token_id": 0,
                "bos_token_id": 1,
                "eos_token_id": 2,
                "user_token_id": 195,
                "assistant_token_id": 196,
                "max_new_tokens": max_gen,
                "temperature": 1.0,
                "top_k": 15,
                "top_p": 0.90,
                "repetition_penalty": 1.00,
                "do_sample": False,
                "transformers_version": "4.29.2"
            }
            model.generation_config = GenerationConfig(**generation_config)
        # Baichuan2's chat() expects the message list (raw_data), not the tokenized inputs (consistent with the traceback below)
        pred = model.chat(tokenizer, raw_data)
        preds.append({"pred": pred, "answers": json_obj["answers"], "all_classes": json_obj["all_classes"], "length": json_obj["length"]})
    return preds
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(seed)
if __name__ == '__main__':
    seed_everything(42)
    load_type = torch.float16
    if torch.cuda.is_available():
        device = torch.device(0)
    else:
        device = torch.device('cpu')
    # use LongBench-E or LongBench
    if args.e:
        en_datasets = ["hotpotqa", "2wikimqa",
                       "qasper", "multifieldqa_en", "gov_report",
                       "trec", "samsum", "triviaqa",
                       "passage_count", "passage_retrieval_en", "multi_news"]
        zh_datasets = []
        code_datasets = ["lcc", "repobench-p"]
        if not os.path.exists(f"{output_dir}/pred_e"):
            os.makedirs(f"{output_dir}/pred_e")
    else:
        en_datasets = ["hotpotqa", "2wikimqa", "musique", "narrativeqa",
                       "qasper", "multifieldqa_en", "gov_report",
                       "qmsum", "trec", "samsum", "triviaqa",
                       "passage_count", "passage_retrieval_en", "multi_news"]
        zh_datasets = ["dureader", "multifieldqa_zh",
                       "vcsum", "lsht", "passage_retrieval_zh"]
        code_datasets = ["lcc", "repobench-p"]
        if not os.path.exists(f"{output_dir}/pred"):
            os.makedirs(f"{output_dir}/pred")
    # select which sub-datasets to use
    datasets = []
    for data_type in predict_on.split(','):
        if data_type == 'zh':
            datasets += zh_datasets
        elif data_type == 'en':
            datasets += en_datasets
        elif data_type == 'code':
            datasets += code_datasets
    print(datasets)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
    # load the model
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
    model = model.eval()
    model_vocab_size = model.get_input_embeddings().weight.size(0)
    print(f"Vocab of the base model: {model_vocab_size}")
    tokenizer_vocab_size = len(tokenizer)
    print(f"Vocab of the tokenizer: {tokenizer_vocab_size}")
    # we design a specific prompt format and max generation length for each task; feel free to modify them to optimize model output
    print("Loading config files dataset2prompt and dataset2maxlen")
    dataset2prompt = json.load(open(dir_path + "/config/dataset2prompt.json", "r"))
    dataset2maxlen = json.load(open(dir_path + "/config/dataset2maxlen.json", "r"))
    print("Config files loaded")
    # predict on each dataset
    for dataset in datasets:
        print(f"Loading dataset {dataset}")
        if args.e:
            # data = load_dataset('THUDM/LongBench', dataset+'_e', split='test')
            data = load_dataset('json', data_files=file_path + dataset + '_e' + '.jsonl', split='train')
            output_path = f"{output_dir}/pred_e/{dataset}.jsonl"
        else:
            data = load_dataset('json', data_files=file_path + dataset + '.jsonl', split='train')
            output_path = f"{output_dir}/pred/{dataset}.jsonl"
        prompt_format = dataset2prompt[dataset]
        max_gen = dataset2maxlen[dataset]
        # preds = get_pred(model, tokenizer, data, max_length, max_gen, prompt_format, dataset, device)
        preds = get_pred(model, tokenizer, data, max_gen, prompt_format, dataset, device)
        with open(output_path, "w", encoding="utf-8") as f:
            for pred in preds:
                json.dump(pred, f, ensure_ascii=False)
                f.write('\n')
1. In the source code, the part of the input that exceeds the length limit is truncated, keeping only the beginning and the end (see the sketch below). In that case, does NTK interpolation ever actually come into play? The input never exceeds the model's max_input_length.
2. After removing the truncation logic I ran Baichuan2 on an A800, but it reported OOM.
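For reference, the truncation referred to in question 1 is the head-and-tail strategy that the script comments out inside get_pred. A minimal sketch of that idea, assuming a HuggingFace tokenizer and a hypothetical max_length of 4096:

def truncate_middle(prompt, tokenizer, max_length=4096):
    # Keep the first and last max_length // 2 tokens and drop the middle,
    # since crucial instructions tend to sit at both ends of the prompt.
    ids = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
    if len(ids) <= max_length:
        return prompt
    half = max_length // 2
    return (tokenizer.decode(ids[:half], skip_special_tokens=True)
            + tokenizer.decode(ids[-half:], skip_special_tokens=True))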
Dependencies (required for code-related issues)
bitsandbytes 0.41.1
open-clip-torch 2.20.0
peft 0.5.0
pytorch-lightning 1.7.7
pytorch-metric-learning 2.3.0
pytorch-wavelets 1.3.0
pytorch-wpe 0.0.1
pytorch3d 0.7.4
rotary-embedding-torch 0.3.0
sentencepiece 0.1.99
taming-transformers-rom1504 0.0.6
torch 2.0.1+cu118
torch-complex 0.4.3
torch-scatter 2.1.1
torchaudio 2.0.2+cu118
torchmetrics 0.11.4
torchsummary 1.5.1
torchvision 0.15.2+cu118
transformers 4.34.1
transformers-stream-generator 0.0.4
Run logs or screenshots
Traceback (most recent call last):
File "pred_baichuan2.py", line 241, in <module>
preds = get_pred(model, tokenizer, data, max_gen, prompt_format, dataset, device)
File "pred_baichuan2.py", line 154, in get_pred
pred = model.chat(tokenizer, raw_data)
File "/root/.cache/huggingface/modules/transformers_modules/Baichuan2_7B_Chat/modeling_baichuan.py", line 853, in chat
outputs = self.generate(input_ids, generation_config=generation_config)
File "/opt/conda/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/transformers/generation/utils.py", line 1652, in generate
return self.sample(
File "/opt/conda/lib/python3.8/site-packages/transformers/generation/utils.py", line 2734, in sample
outputs = self(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/Baichuan2_7B_Chat/modeling_baichuan.py", line 756, in forward
outputs = self.model(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/Baichuan2_7B_Chat/modeling_baichuan.py", line 533, in forward
layer_outputs = decoder_layer(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/Baichuan2_7B_Chat/modeling_baichuan.py", line 345, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/Baichuan2_7B_Chat/modeling_baichuan.py", line 306, in forward
attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 27.74 GiB (GPU 1; 79.35 GiB total capacity; 32.57 GiB already allocated; 17.55 GiB free; 60.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
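As a rough back-of-envelope for the 27.74 GiB allocation above, assuming the scaled_dot_product_attention call falls back to a backend that materializes the full fp16 score tensor of shape [1, num_heads, seq_len, seq_len] (Baichuan2-7B uses 32 attention heads):

bytes_requested = 27.74 * 2 ** 30               # ~2.98e10 bytes
seq_len = (bytes_requested / (2 * 32)) ** 0.5   # 2 bytes per fp16 element, 32 heads
print(int(seq_len))                             # ~21.6k tokens, i.e. an untruncated LongBench sample

which is consistent with attention memory growing quadratically in sequence length once the truncation mentioned in question 2 is removed.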
iMountTai commented
- NTK does not perform any interpolation; max_input_length itself exists to limit the input length of the test samples so that it fits the model's supported context length.
- Sorry, please raise Baichuan-related questions in the corresponding project.
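For context on the first point: NTK-style context extension typically rescales the RoPE base rather than interpolating position indices, which is why no interpolation step is involved. A minimal sketch of that idea (not the repository's actual apply_ntk_scaling_patch; head_dim and alpha below are illustrative):

import torch

def ntk_scaled_inv_freq(dim, alpha, base=10000.0):
    # Enlarge the rotary base by alpha ** (dim / (dim - 2)); position ids are
    # left untouched, so no positional interpolation takes place.
    scaled_base = base * alpha ** (dim / (dim - 2))
    return 1.0 / (scaled_base ** (torch.arange(0, dim, 2).float() / dim))

inv_freq = ntk_scaled_inv_freq(dim=128, alpha=7.0)  # e.g. head_dim=128, alpha from the heuristic above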
github-actions commented
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your consideration.
github-actions commented
Closing the issue, since no updates observed. Feel free to re-open if you need any further assistance.