xusenlinzy/api-for-open-llm

Qwen-14B-Chat wrapped API: KV cache usage stays stuck at 94.1%, no response for a long time, and other requests can no longer get through

Alwin4Zhang opened this issue · 2 comments

提交前必须检查以下项目 | The following items must be checked before submission

  • 请确保使用的是仓库最新代码(git pull),一些问题已被解决和修复。 | Make sure you are using the latest code from the repository (git pull), some issues have already been addressed and fixed.
  • 我已阅读项目文档FAQ章节并且已在Issue中对问题进行了搜索,没有找到相似问题和解决方案 | I have searched the existing issues / discussions

问题类型 | Type of problem

模型推理和部署 | Model inference and deployment

操作系统 | Operating system

Linux

详细描述问题 | Detailed description of the problem

models.py: add a SWAP_SPACE=8 setting

import asyncio

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from sentence_transformers import SentenceTransformer

from api.apapter import get_prompt_adapter
from api.config import config


def create_app():
    """ create fastapi app server """
    app = FastAPI()
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    return app


def create_embedding_model() -> SentenceTransformer:
    """ get embedding model from sentence-transformers. """
    return SentenceTransformer(config.EMBEDDING_NAME, device=config.EMBEDDING_DEVICE)


def create_generate_model():
    """ get generate model for chat or completion. """
    from api.generation import ModelServer
    from api.apapter.model import load_model

    if config.PATCH_TYPE == "attention":
        from api.utils.patches import apply_attention_patch

        apply_attention_patch(use_memory_efficient_attention=True)
    if config.PATCH_TYPE == "ntk":
        from api.utils.patches import apply_ntk_scaling_patch

        apply_ntk_scaling_patch(config.ALPHA)

    model, tokenizer = load_model(
        config.MODEL_NAME,
        model_name_or_path=config.MODEL_PATH,
        adapter_model=config.ADAPTER_MODEL_PATH,
        quantize=config.QUANTIZE,
        device=config.DEVICE,
        device_map=config.DEVICE_MAP,
        num_gpus=config.NUM_GPUs,
        load_in_8bit=config.LOAD_IN_8BIT,
        load_in_4bit=config.LOAD_IN_4BIT,
        use_ptuning_v2=config.USING_PTUNING_V2,
        dtype=config.DTYPE,
        resize_embeddings=config.RESIZE_EMBEDDINGS,
    )

    return ModelServer(
        model,
        tokenizer,
        config.DEVICE,
        model_name=config.MODEL_NAME,
        context_len=config.CONTEXT_LEN,
        stream_interval=config.STREAM_INTERVERL,
        prompt_name=config.PROMPT_NAME,
        use_streamer_v2=config.USE_STREAMER_V2,
    )


def get_context_len(model_config) -> int:
    """ fix for model max length. """
    if "qwen" in config.MODEL_NAME.lower():
        max_model_len = config.CONTEXT_LEN or 8192
    else:
        max_model_len = config.CONTEXT_LEN or model_config.max_model_len
    return max_model_len


def create_vllm_engine():
    """ get vllm generate engine for chat or completion. """
    try:
        from vllm.engine.arg_utils import AsyncEngineArgs
        from vllm.engine.async_llm_engine import AsyncLLMEngine
        from vllm.transformers_utils.tokenizer import get_tokenizer
    except ImportError:
        return None

    engine_args = AsyncEngineArgs(
        model=config.MODEL_PATH,
        tokenizer_mode=config.TOKENIZE_MODE,
        trust_remote_code=config.TRUST_REMOTE_CODE,
        dtype=config.DTYPE,
        tensor_parallel_size=config.TENSOR_PARALLEL_SIZE,
        gpu_memory_utilization=config.GPU_MEMORY_UTILIZATION,
        swap_space=config.SWAP_SPACE,  # added: CPU swap space (GiB) passed to vLLM
        max_num_batched_tokens=config.MAX_NUM_BATCHED_TOKENS,
        max_num_seqs=config.MAX_NUM_SEQS,
        max_model_len=config.CONTEXT_LEN,
        quantization=config.QUANTIZATION_METHOD,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    # A separate tokenizer to map token IDs to strings.
    engine.engine.tokenizer = get_tokenizer(
        engine_args.tokenizer,
        tokenizer_mode=engine_args.tokenizer_mode,
        trust_remote_code=True,
    )

    # prompt adapter for constructing model inputs
    engine.prompt_adapter = get_prompt_adapter(
        config.MODEL_NAME.lower(),
        prompt_name=config.PROMPT_NAME.lower() if config.PROMPT_NAME else None
    )

    engine_model_config = asyncio.run(engine.get_model_config())
    engine.engine.scheduler_config.max_model_len = get_context_len(engine_model_config)
    engine.max_model_len = get_context_len(engine_model_config)

    return engine


# fastapi app
app = create_app()

# model for embedding
EMBEDDED_MODEL = create_embedding_model() if (config.EMBEDDING_NAME and config.ACTIVATE_INFERENCE) else None

# model for transformers generate
if config.ONLY_EMBEDDING:
    GENERATE_MDDEL = None
    VLLM_ENGINE = None
else:
    GENERATE_MDDEL = create_generate_model() if (not config.USE_VLLM and config.ACTIVATE_INFERENCE) else None
    # model for vllm generate
    VLLM_ENGINE = create_vllm_engine() if (config.USE_VLLM and config.ACTIVATE_INFERENCE) else None

# model names for special processing
EXCLUDE_MODELS = ["baichuan-13b", "qwen"]

config.py: add a SWAP_SPACE configuration

import os

import dotenv
from loguru import logger

dotenv.load_dotenv()


DEFAULTS = {
    'HOST': '0.0.0.0',
    'PORT': 8000,

    # support for model
    'MODEL_NAME': '',
    'MODEL_PATH': '',
    'ADAPTER_MODEL_PATH': '',
    'RESIZE_EMBEDDINGS': 'False',

    # support for device
    'DEVICE': 'cuda',
    'DEVICE_MAP': "",
    'GPUS': '',
    'NUM_GPUs': 1,

    # support for embeddings
    'ONLY_EMBEDDING': 'False',
    'EMBEDDING_NAME': '',
    'EMBEDDING_SIZE': '',
    'EMBEDDING_DEVICE': 'cuda',

    # support for quantize
    'QUANTIZE': 16,
    'LOAD_IN_8BIT': 'False',
    'LOAD_IN_4BIT': 'False',
    'USING_PTUNING_V2': 'False',

    # support for model input
    'CONTEXT_LEN': '',
    'STREAM_INTERVERL': 2,
    'PROMPT_NAME': '',

    'PATCH_TYPE': '',
    'ALPHA': 'auto',

    'API_PREFIX': '/v1',

    # support for vllm
    'USE_VLLM': 'False',
    'TRUST_REMOTE_CODE': "False",
    'TOKENIZE_MODE': "auto",
    'TENSOR_PARALLEL_SIZE': 1,
    'DTYPE': "half",
    "GPU_MEMORY_UTILIZATION": 0.9,
    "MAX_NUM_BATCHED_TOKENS": "",
    "MAX_NUM_SEQS": 256,
    "QUANTIZATION_METHOD": "",
    **"SWAP_SPACE": 8,**

    # support for transformers.TextIteratorStreamer
    'USE_STREAMER_V2': 'False',

    # support for api key check
    'API_KEYS': '',

    'ACTIVATE_INFERENCE': 'True',
}


def get_env(key):
    return os.environ.get(key, DEFAULTS.get(key))


def get_bool_env(key):
    return get_env(key).lower() == 'true'


class Config:
    """ Configuration class. """

    def __init__(self):
        self.HOST = get_env('HOST')
        self.PORT = int(get_env('PORT'))

        self.MODEL_NAME = get_env('MODEL_NAME')
        self.MODEL_PATH = get_env('MODEL_PATH')
        self.ADAPTER_MODEL_PATH = get_env('ADAPTER_MODEL_PATH') if get_env('ADAPTER_MODEL_PATH') else None
        self.RESIZE_EMBEDDINGS = get_bool_env('RESIZE_EMBEDDINGS')

        self.DEVICE = get_env('DEVICE')
        self.DEVICE_MAP = get_env('DEVICE_MAP') if get_env('DEVICE_MAP') else None
        self.GPUS = get_env('GPUS')
        self.NUM_GPUs = int(get_env('NUM_GPUs'))

        self.ONLY_EMBEDDING = get_bool_env('ONLY_EMBEDDING')
        self.EMBEDDING_NAME = get_env('EMBEDDING_NAME') if get_env('EMBEDDING_NAME') else None
        self.EMBEDDING_SIZE = int(get_env('EMBEDDING_SIZE')) if get_env('EMBEDDING_SIZE') else None
        self.EMBEDDING_DEVICE = get_env('EMBEDDING_DEVICE')

        self.QUANTIZE = int(get_env('QUANTIZE'))
        self.LOAD_IN_8BIT = get_bool_env('LOAD_IN_8BIT')
        self.LOAD_IN_4BIT = get_bool_env('LOAD_IN_4BIT')
        self.USING_PTUNING_V2 = get_bool_env('USING_PTUNING_V2')

        self.CONTEXT_LEN = int(get_env('CONTEXT_LEN')) if get_env('CONTEXT_LEN') else None
        self.STREAM_INTERVERL = int(get_env('STREAM_INTERVERL'))
        self.PROMPT_NAME = get_env('PROMPT_NAME') if get_env('PROMPT_NAME') else None

        self.PATCH_TYPE = get_env('PATCH_TYPE') if get_env('PATCH_TYPE') else None
        self.ALPHA = get_env('ALPHA')

        self.API_PREFIX = get_env('API_PREFIX')

        self.USE_VLLM = get_bool_env('USE_VLLM')
        self.TRUST_REMOTE_CODE = get_bool_env('TRUST_REMOTE_CODE')
        self.TOKENIZE_MODE = get_env('TOKENIZE_MODE')
        self.TENSOR_PARALLEL_SIZE = int(get_env('TENSOR_PARALLEL_SIZE'))
        self.DTYPE = get_env('DTYPE')
        self.GPU_MEMORY_UTILIZATION = float(get_env('GPU_MEMORY_UTILIZATION'))
        self.MAX_NUM_BATCHED_TOKENS = int(get_env('MAX_NUM_BATCHED_TOKENS')) if get_env('MAX_NUM_BATCHED_TOKENS') else None
        self.MAX_NUM_SEQS = int(get_env('MAX_NUM_SEQS'))
        self.QUANTIZATION_METHOD = get_env('QUANTIZATION_METHOD') if get_env('QUANTIZATION_METHOD') else None

        self.USE_STREAMER_V2 = get_bool_env('USE_STREAMER_V2')

        self.API_KEYS = get_env('API_KEYS').split(',') if get_env('API_KEYS') else None

        self.ACTIVATE_INFERENCE = get_bool_env('ACTIVATE_INFERENCE')

        # added: CPU swap space (GiB) passed through to vLLM's AsyncEngineArgs
        self.SWAP_SPACE = int(get_env('SWAP_SPACE'))


config = Config()
logger.debug(f"Config: {config.__dict__}")
if config.GPUS:
    if len(config.GPUS.split(",")) < config.NUM_GPUs:
        raise ValueError(
            f"Larger --num_gpus ({config.NUM_GPUs}) than --gpus {config.GPUS}!"
        )
    os.environ["CUDA_VISIBLE_DEVICES"] = config.GPUS

Dependencies

# 请在此处粘贴依赖情况
# Please paste the dependencies here

运行日志或截图 | Runtime logs or screenshots

curl --location --request POST 'http://ip:8000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
    "model": "Qwen-14B-Chat",
    "stream": true,
    "messages": [
        {
            "role": "user",
            "content": "Elasticsearch自定义插件示例"
        },
        {
            "role": "assistant",
            "content": "在Elasticsearch中,我们可以自定义插件来扩展其功能。以下是一个简单的自定义插件示例,该插件可以在索引创建时自动添加一个新字段。\n\n首先,我们需要创建一个Java类,该类实现了`org.elasticsearch.plugin.Plugin`接口。这个接口只有一个方法,`java.lang.Object configure()`,在这个方法中,我们可以配置我们的插件。\n\n```javaimport org.elasticsearch.common.settings.Settings;\nimport org.elasticsearch.index.IndexSettings;\nimport org.elasticsearch.plugin.Plugin;\n\npublic class MyPlugin implements Plugin {\n\n @Override public void onPluginStart(Settings settings) {\n // 在插件启动时执行的代码 }\n\n @Override public void onPluginStop(Settings settings) {\n // 在插件停止时执行的代码 }\n\n @Override public void configure(Settings settings) {\n // 在插件配置时执行的代码 Settings indexSettings = Settings.builder()\n .put(IndexSettings.SETTING_NUMBER_OF_SHARDS,1)\n .put(IndexSettings.SETTING_NUMBER_OF_REPLICAS,0)\n .build();\n\n settings.put(\"myplugin.index\", indexSettings);\n }\n}\n```\n\n然后,我们需要创建一个`META-INF/services`目录,并在其中创建一个名为`org.elasticsearch.plugin.Plugin`的文件。在文件中,我们将我们的插件类的全限定名写入。\n\n```bash$ mkdir META-INF/services$ echo com.example.MyPlugin > META-INF/services/org.elasticsearch.plugin.Plugin```\n\n最后,我们需要将我们的插件类打包成一个jar文件,并将其放在Elasticsearch的`plugins`目录中。\n\n```bash$ jar cvf myplugin-1.0.jar MyPlugin.class$ cp myplugin-1.0.jar plugins/\n```\n\n现在,当我们启动Elasticsearch时,它将会使用我们的插件。我们可以在Elasticsearch的配置文件中看到我们在`configure`方法中设置的索引设置。\n\n注意:这只是一个非常简单的示例,实际的插件可能会更复杂。例如,你可能需要创建一个新的索引模板,或者在索引创建时运行一些脚本。"
        },
        {
            "role": "user",
            "content": "来个示例"
        }
    ],
    "temperature": 0.7,
    "early_stopping": true
}'

This is 100% reproducible.

After changing the default SWAP_SPACE parameter to 8, the API responds normally.
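For anyone who prefers not to patch `DEFAULTS`, the same value can be supplied through the environment (or the `.env` file read by `dotenv.load_dotenv()`). A minimal sketch, assuming the `config.py` above is importable as `api.config`:

```python
# Hypothetical check that the environment variable reaches the config object.
# SWAP_SPACE must be set before api.config is imported, because Config() reads
# os.environ at import time and load_dotenv() does not override existing variables.
import os

os.environ["SWAP_SPACE"] = "8"   # GiB of CPU swap space per GPU handed to vLLM

from api.config import config

assert config.SWAP_SPACE == 8    # later forwarded as swap_space=... to AsyncEngineArgs
```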

Related issue: this is actually a vLLM bug, see vllm-project/vllm#1206

At present, we have found a workaround: set the swap space directly to 0. That way the CPU swap space is never used and no error is raised. The number of CPU blocks also becomes 0, which may slow things down a bit, but at least the service no longer hangs and dies.
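For reference, a minimal sketch of that workaround written directly against vLLM's engine arguments (the model path is a placeholder, and the exact argument set depends on the vLLM version in use):

```python
# Sketch only: start an AsyncLLMEngine with CPU swap disabled (swap_space=0),
# so the scheduler never touches the CPU KV-cache swap path that triggers the hang.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

engine_args = AsyncEngineArgs(
    model="/path/to/Qwen-14B-Chat",   # placeholder: local Qwen-14B-Chat checkout
    trust_remote_code=True,
    dtype="half",
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,
    swap_space=0,                     # workaround: 0 GiB CPU swap, 0 CPU blocks
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
```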