通义千问Qwen-14B-Chat封装接口 kv cache 一直卡在94.1%,长时间无响应,然后其他服务进不来
Alwin4Zhang opened this issue · 2 comments
Alwin4Zhang commented
提交前必须检查以下项目 | The following items must be checked before submission
- 请确保使用的是仓库最新代码(git pull),一些问题已被解决和修复。 | Make sure you are using the latest code from the repository (git pull), some issues have already been addressed and fixed.
- 我已阅读项目文档和FAQ章节并且已在Issue中对问题进行了搜索,没有找到相似问题和解决方案 | I have searched the existing issues / discussions
问题类型 | Type of problem
模型推理和部署 | Model inference and deployment
操作系统 | Operating system
Linux
详细描述问题 | Detailed description of the problem
models.py 添加SWAP_SPACE=8的设置
import asyncio
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from sentence_transformers import SentenceTransformer
from api.apapter import get_prompt_adapter
from api.config import config
def create_app():
    """Build the FastAPI application with permissive CORS enabled."""
    application = FastAPI()
    # Allow any origin, method and header so browser clients can call the API.
    application.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    return application
def create_embedding_model() -> SentenceTransformer:
    """Load the sentence-transformers embedding model onto the configured device."""
    model_name = config.EMBEDDING_NAME
    device = config.EMBEDDING_DEVICE
    return SentenceTransformer(model_name, device=device)
def create_generate_model():
    """Load a transformers model and wrap it in a ModelServer for chat/completion."""
    from api.generation import ModelServer
    from api.apapter.model import load_model

    # Apply an optional monkey patch selected by PATCH_TYPE before loading
    # weights (PATCH_TYPE is a single value, so the branches are exclusive).
    if config.PATCH_TYPE == "attention":
        from api.utils.patches import apply_attention_patch
        apply_attention_patch(use_memory_efficient_attention=True)
    elif config.PATCH_TYPE == "ntk":
        from api.utils.patches import apply_ntk_scaling_patch
        apply_ntk_scaling_patch(config.ALPHA)

    # Everything load_model needs beyond the model name, gathered once.
    load_kwargs = dict(
        model_name_or_path=config.MODEL_PATH,
        adapter_model=config.ADAPTER_MODEL_PATH,
        quantize=config.QUANTIZE,
        device=config.DEVICE,
        device_map=config.DEVICE_MAP,
        num_gpus=config.NUM_GPUs,
        load_in_8bit=config.LOAD_IN_8BIT,
        load_in_4bit=config.LOAD_IN_4BIT,
        use_ptuning_v2=config.USING_PTUNING_V2,
        dtype=config.DTYPE,
        resize_embeddings=config.RESIZE_EMBEDDINGS,
    )
    model, tokenizer = load_model(config.MODEL_NAME, **load_kwargs)

    server = ModelServer(
        model,
        tokenizer,
        config.DEVICE,
        model_name=config.MODEL_NAME,
        context_len=config.CONTEXT_LEN,
        stream_interval=config.STREAM_INTERVERL,
        prompt_name=config.PROMPT_NAME,
        use_streamer_v2=config.USE_STREAMER_V2,
    )
    return server
def get_context_len(model_config) -> int:
    """Resolve the effective maximum model length.

    An explicit CONTEXT_LEN setting always wins; otherwise Qwen models get
    8192 and every other model falls back to the value reported by the
    engine's model config.
    """
    if config.CONTEXT_LEN:
        return config.CONTEXT_LEN
    return 8192 if "qwen" in config.MODEL_NAME.lower() else model_config.max_model_len
def create_vllm_engine():
    """Create the vLLM async generate engine for chat/completion.

    Returns:
        An AsyncLLMEngine configured from the global ``config``, or ``None``
        when vLLM is not installed (callers fall back to the transformers path).
    """
    try:
        from vllm.engine.arg_utils import AsyncEngineArgs
        from vllm.engine.async_llm_engine import AsyncLLMEngine
        from vllm.transformers_utils.tokenizer import get_tokenizer
    except ImportError:
        # vLLM is an optional dependency.
        return None

    engine_args = AsyncEngineArgs(
        model=config.MODEL_PATH,
        tokenizer_mode=config.TOKENIZE_MODE,
        trust_remote_code=config.TRUST_REMOTE_CODE,
        dtype=config.DTYPE,
        tensor_parallel_size=config.TENSOR_PARALLEL_SIZE,
        gpu_memory_utilization=config.GPU_MEMORY_UTILIZATION,
        # CPU swap space per GPU (GiB) for preempted sequences; was garbled
        # as "**swap_space = config.SWAP_SPACE,**" (markdown-bold artifact),
        # which is not valid Python.
        swap_space=config.SWAP_SPACE,
        max_num_batched_tokens=config.MAX_NUM_BATCHED_TOKENS,
        max_num_seqs=config.MAX_NUM_SEQS,
        max_model_len=config.CONTEXT_LEN,
        quantization=config.QUANTIZATION_METHOD,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    # A separate tokenizer to map token IDs to strings.
    engine.engine.tokenizer = get_tokenizer(
        engine_args.tokenizer,
        tokenizer_mode=engine_args.tokenizer_mode,
        trust_remote_code=True,
    )

    # prompt adapter for constructing model inputs
    engine.prompt_adapter = get_prompt_adapter(
        config.MODEL_NAME.lower(),
        prompt_name=config.PROMPT_NAME.lower() if config.PROMPT_NAME else None,
    )

    # Resolve the effective context length once (the original computed it
    # twice) and apply it to both the scheduler config and the attribute
    # request handlers read.
    engine_model_config = asyncio.run(engine.get_model_config())
    max_model_len = get_context_len(engine_model_config)
    engine.engine.scheduler_config.max_model_len = max_model_len
    engine.max_model_len = max_model_len
    return engine
# fastapi app
app = create_app()

# model for embedding; only loaded when an embedding model name is configured
# and inference is activated.
EMBEDDED_MODEL = create_embedding_model() if (config.EMBEDDING_NAME and config.ACTIVATE_INFERENCE) else None

# model for transformers generate
# NOTE(review): "GENERATE_MDDEL" looks like a typo for "GENERATE_MODEL", but
# the name is part of this module's public API — renaming would break callers.
if config.ONLY_EMBEDDING:
    GENERATE_MDDEL = None
    VLLM_ENGINE = None
else:
    # Exactly one of the two backends is created, selected by USE_VLLM.
    GENERATE_MDDEL = create_generate_model() if (not config.USE_VLLM and config.ACTIVATE_INFERENCE) else None
    # model for vllm generate
    VLLM_ENGINE = create_vllm_engine() if (config.USE_VLLM and config.ACTIVATE_INFERENCE) else None

# model names for special processing
EXCLUDE_MODELS = ["baichuan-13b", "qwen"]
config.py 添加 SWAP_SPACE配置
import os
import dotenv
from loguru import logger
dotenv.load_dotenv()
# Fallback values used when the corresponding environment variable is unset.
# Boolean-like settings are stored as the strings 'True'/'False' and parsed
# by get_bool_env; empty strings mean "not configured".
DEFAULTS = {
    'HOST': '0.0.0.0',
    'PORT': 8000,
    # support for model
    'MODEL_NAME': '',
    'MODEL_PATH': '',
    'ADAPTER_MODEL_PATH': '',
    'RESIZE_EMBEDDINGS': 'False',
    # support for device
    'DEVICE': 'cuda',
    'DEVICE_MAP': "",
    'GPUS': '',
    'NUM_GPUs': 1,
    # support for embeddings
    'ONLY_EMBEDDING': 'False',
    'EMBEDDING_NAME': '',
    'EMBEDDING_SIZE': '',
    'EMBEDDING_DEVICE': 'cuda',
    # support for quantize
    'QUANTIZE': 16,
    'LOAD_IN_8BIT': 'False',
    'LOAD_IN_4BIT': 'False',
    'USING_PTUNING_V2': 'False',
    # support for model input
    'CONTEXT_LEN': '',
    'STREAM_INTERVERL': 2,
    'PROMPT_NAME': '',
    'PATCH_TYPE': '',
    'ALPHA': 'auto',
    'API_PREFIX': '/v1',
    # support for vllm
    'USE_VLLM': 'False',
    'TRUST_REMOTE_CODE': "False",
    'TOKENIZE_MODE': "auto",
    'TENSOR_PARALLEL_SIZE': 1,
    'DTYPE': "half",
    "GPU_MEMORY_UTILIZATION": 0.9,
    "MAX_NUM_BATCHED_TOKENS": "",
    "MAX_NUM_SEQS": 256,
    "QUANTIZATION_METHOD": "",
    # CPU swap space (GiB) for vLLM; was garbled as **"SWAP_SPACE": 8,**
    # (markdown-bold artifact), which breaks the dict literal.
    "SWAP_SPACE": 8,
    # support for transformers.TextIteratorStreamer
    'USE_STREAMER_V2': 'False',
    # support for api key check
    'API_KEYS': '',
    'ACTIVATE_INFERENCE': 'True',
}
def get_env(key):
    """Return the environment value for *key*, falling back to DEFAULTS."""
    fallback = DEFAULTS.get(key)
    return os.environ.get(key, fallback)
def get_bool_env(key):
    """Return True iff the resolved value for *key* is the string 'true' (case-insensitive).

    The original crashed with AttributeError when a key was missing from both
    the environment and DEFAULTS (get_env returns None); treat that case as
    False instead.
    """
    value = get_env(key)
    return value is not None and value.lower() == 'true'
class Config:
    """Runtime configuration resolved once from environment variables.

    Every attribute is read via get_env / get_bool_env, falling back to the
    module-level DEFAULTS mapping. Empty-string values are normalized to
    ``None`` for the optional settings.
    """

    def __init__(self):
        # --- server binding ---
        self.HOST = get_env('HOST')
        self.PORT = int(get_env('PORT'))
        # --- model identity and weights ---
        self.MODEL_NAME = get_env('MODEL_NAME')
        self.MODEL_PATH = get_env('MODEL_PATH')
        # Empty string means "no adapter"; normalize to None.
        self.ADAPTER_MODEL_PATH = get_env('ADAPTER_MODEL_PATH') if get_env('ADAPTER_MODEL_PATH') else None
        self.RESIZE_EMBEDDINGS = get_bool_env('RESIZE_EMBEDDINGS')
        # --- device placement ---
        self.DEVICE = get_env('DEVICE')
        self.DEVICE_MAP = get_env('DEVICE_MAP') if get_env('DEVICE_MAP') else None
        self.GPUS = get_env('GPUS')
        self.NUM_GPUs = int(get_env('NUM_GPUs'))
        # --- embedding model ---
        self.ONLY_EMBEDDING = get_bool_env('ONLY_EMBEDDING')
        self.EMBEDDING_NAME = get_env('EMBEDDING_NAME') if get_env('EMBEDDING_NAME') else None
        self.EMBEDDING_SIZE = int(get_env('EMBEDDING_SIZE')) if get_env('EMBEDDING_SIZE') else None
        self.EMBEDDING_DEVICE = get_env('EMBEDDING_DEVICE')
        # --- quantization ---
        self.QUANTIZE = int(get_env('QUANTIZE'))
        self.LOAD_IN_8BIT = get_bool_env('LOAD_IN_8BIT')
        self.LOAD_IN_4BIT = get_bool_env('LOAD_IN_4BIT')
        self.USING_PTUNING_V2 = get_bool_env('USING_PTUNING_V2')
        # --- generation / prompting ---
        self.CONTEXT_LEN = int(get_env('CONTEXT_LEN')) if get_env('CONTEXT_LEN') else None
        # NOTE(review): "INTERVERL" looks like a typo for "INTERVAL", but the
        # name matches the env var and DEFAULTS key — kept for compatibility.
        self.STREAM_INTERVERL = int(get_env('STREAM_INTERVERL'))
        self.PROMPT_NAME = get_env('PROMPT_NAME') if get_env('PROMPT_NAME') else None
        self.PATCH_TYPE = get_env('PATCH_TYPE') if get_env('PATCH_TYPE') else None
        self.ALPHA = get_env('ALPHA')
        self.API_PREFIX = get_env('API_PREFIX')
        # --- vLLM engine ---
        self.USE_VLLM = get_bool_env('USE_VLLM')
        self.TRUST_REMOTE_CODE = get_bool_env('TRUST_REMOTE_CODE')
        self.TOKENIZE_MODE = get_env('TOKENIZE_MODE')
        self.TENSOR_PARALLEL_SIZE = int(get_env('TENSOR_PARALLEL_SIZE'))
        self.DTYPE = get_env('DTYPE')
        self.GPU_MEMORY_UTILIZATION = float(get_env('GPU_MEMORY_UTILIZATION'))
        self.MAX_NUM_BATCHED_TOKENS = int(get_env('MAX_NUM_BATCHED_TOKENS')) if get_env('MAX_NUM_BATCHED_TOKENS') else None
        self.MAX_NUM_SEQS = int(get_env('MAX_NUM_SEQS'))
        self.QUANTIZATION_METHOD = get_env('QUANTIZATION_METHOD') if get_env('QUANTIZATION_METHOD') else None
        # --- misc ---
        self.USE_STREAMER_V2 = get_bool_env('USE_STREAMER_V2')
        self.API_KEYS = get_env('API_KEYS').split(',') if get_env('API_KEYS') else None
        self.ACTIVATE_INFERENCE = get_bool_env('ACTIVATE_INFERENCE')
        # CPU swap space (GiB) forwarded to vLLM's AsyncEngineArgs.
        self.SWAP_SPACE = int(get_env('SWAP_SPACE'))
# Singleton configuration instance shared by the API modules.
config = Config()
logger.debug(f"Config: {config.__dict__}")

# When an explicit GPU list is given, validate it against NUM_GPUs and
# restrict CUDA to exactly those devices.
if config.GPUS:
    if len(config.GPUS.split(",")) < config.NUM_GPUs:
        raise ValueError(
            f"Larger --num_gpus ({config.NUM_GPUs}) than --gpus {config.GPUS}!"
        )
    os.environ["CUDA_VISIBLE_DEVICES"] = config.GPUS
Dependencies
# 请在此处粘贴依赖情况
# Please paste the dependencies here
运行日志或截图 | Runtime logs or screenshots
curl --location --request POST 'http://ip:8000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "Qwen-14B-Chat",
"stream": true,
"messages": [
{
"role": "user",
"content": "Elasticsearch自定义插件示例"
},
{
"role": "assistant",
"content": "在Elasticsearch中,我们可以自定义插件来扩展其功能。以下是一个简单的自定义插件示例,该插件可以在索引创建时自动添加一个新字段。\n\n首先,我们需要创建一个Java类,该类实现了`org.elasticsearch.plugin.Plugin`接口。这个接口只有一个方法,`java.lang.Object configure()`,在这个方法中,我们可以配置我们的插件。\n\n```javaimport org.elasticsearch.common.settings.Settings;\nimport org.elasticsearch.index.IndexSettings;\nimport org.elasticsearch.plugin.Plugin;\n\npublic class MyPlugin implements Plugin {\n\n @Override public void onPluginStart(Settings settings) {\n // 在插件启动时执行的代码 }\n\n @Override public void onPluginStop(Settings settings) {\n // 在插件停止时执行的代码 }\n\n @Override public void configure(Settings settings) {\n // 在插件配置时执行的代码 Settings indexSettings = Settings.builder()\n .put(IndexSettings.SETTING_NUMBER_OF_SHARDS,1)\n .put(IndexSettings.SETTING_NUMBER_OF_REPLICAS,0)\n .build();\n\n settings.put(\"myplugin.index\", indexSettings);\n }\n}\n```\n\n然后,我们需要创建一个`META-INF/services`目录,并在其中创建一个名为`org.elasticsearch.plugin.Plugin`的文件。在文件中,我们将我们的插件类的全限定名写入。\n\n```bash$ mkdir META-INF/services$ echo com.example.MyPlugin > META-INF/services/org.elasticsearch.plugin.Plugin```\n\n最后,我们需要将我们的插件类打包成一个jar文件,并将其放在Elasticsearch的`plugins`目录中。\n\n```bash$ jar cvf myplugin-1.0.jar MyPlugin.class$ cp myplugin-1.0.jar plugins/\n```\n\n现在,当我们启动Elasticsearch时,它将会使用我们的插件。我们可以在Elasticsearch的配置文件中看到我们在`configure`方法中设置的索引设置。\n\n注意:这只是一个非常简单的示例,实际的插件可能会更复杂。例如,你可能需要创建一个新的索引模板,或者在索引创建时运行一些脚本。"
},
{
"role": "user",
"content": "来个示例"
}
],
"temperature": 0.7,
"early_stopping": true
}'
100%可以复现
Alwin4Zhang commented
关联问题,其实是vllm的bug:vllm-project/vllm#1206
chi2liu commented
At present, we have found a workaround and set the swap space directly to 0. This way, we will not call the CPU swap space and will not report any errors. However, the CPU blocks will also become 0, which may slow down the speed a bit, but at least it will not hang and die.