A library for validating and benchmarking LLM inference.
python3 scalellm_run_benchmark.py --input_file /data/dataset/F_alpaca_group_10_2.json --model_dir=/data/llama-2-7b-hf --batch_size=16
- --input_file: path to the input JSON file (a small inspection sketch follows the argument list)
- --model_dir: model directory
- --batch_size: batch size used when running inference
- --data_format: v1 or v2, indicating the input JSON format
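The exact v1/v2 JSON schemas are not documented here, so it can help to peek at the dataset file before a run. The snippet below is a minimal sketch; the `prompt` field name is an assumption and may differ from the real format.

```python
import json

# Minimal sketch: inspect the benchmark input file before running.
# The "prompt" key is an assumed field name, not a documented part of the
# v1/v2 formats; adapt it to the actual schema of your dataset.
with open("/data/dataset/F_alpaca_group_10_2.json") as f:
    records = json.load(f)

print(f"{len(records)} records loaded")
first = records[0]
if isinstance(first, dict):
    print("fields per record:", sorted(first.keys()))
    print("sample prompt:", str(first.get("prompt", first))[:200])
```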
python3 vllm_run_benchmark.py --input_file /data/dataset/Chatbot_group_10_2.json --model_dir=/data/llama-2-7b-hf --batch_size=16
- --input_file: path to the input JSON file
- --model_dir: model directory
- --batch_size: batch size used when running inference
- --data_format: v1 or v2, indicating the input JSON format
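For reference, the core of an offline vLLM throughput measurement looks roughly like the sketch below. It is not `vllm_run_benchmark.py` itself; the prompts and sampling settings are placeholders, and only the public `LLM`/`SamplingParams` API is used.

```python
import time

from vllm import LLM, SamplingParams

# Rough sketch of a generate-and-time loop with vLLM's offline API.
# Prompts stand in for the JSON dataset; settings are illustrative only.
prompts = ["Explain the transformer architecture."] * 16
sampling = SamplingParams(temperature=0.0, max_tokens=100)

llm = LLM(model="/data/llama-2-7b-hf")  # same --model_dir as above

start = time.perf_counter()
outputs = llm.generate(prompts, sampling)
elapsed = time.perf_counter() - start

generated = sum(len(o.outputs[0].token_ids) for o in outputs)
print(f"{generated} generated tokens in {elapsed:.2f}s "
      f"({generated / elapsed:.1f} tok/s)")
```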
git clone https://github.com/NVIDIA/TensorRT-LLM.git
python TensorRT-LLM/examples/qwen/convert_checkpoint.py --model_dir /data/qwen-7b --output_dir /data/qwen-7b-ckpt --dtype float16
- --workers: number of parallel conversion workers (tensor-parallel degree)
- --model_dir: Hugging Face model directory
- --dtype: data type (e.g., float16)
- --output_dir: output checkpoint directory
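After conversion it can be worth checking that the checkpoint directory looks sane. The sketch below assumes the usual TensorRT-LLM layout of a `config.json` plus per-rank `rank*.safetensors` shards, which may differ between TensorRT-LLM versions.

```python
import json
from pathlib import Path

# Assumed converted-checkpoint layout: config.json + rank*.safetensors shards.
# Adjust the paths/globs if your TensorRT-LLM version writes something else.
ckpt_dir = Path("/data/qwen-7b-ckpt")

config = json.loads((ckpt_dir / "config.json").read_text())
shards = sorted(ckpt_dir.glob("rank*.safetensors"))

print("dtype:", config.get("dtype"))
print("tensor-parallel shards found:", len(shards))
```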
trtllm-build --checkpoint_dir /data/qwen-7b-ckpt --gemm_plugin float16 --gpt_attention_plugin float16 --max_batch_size 256 --output_dir /data/qwen-7b-engine
- --max_batch_size: maximum batch size the engine supports
- --max_input_len: maximum input length
- --max_output_len: maximum output length
- --output_dir: output engine directory
- --checkpoint_dir: converted checkpoint directory
- --workers: number of parallel build workers (tensor-parallel degree)
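Before launching the full benchmark, a quick single-prompt smoke test of the built engine can catch build problems early. The sketch below is modeled on the `ModelRunner` API used by TensorRT-LLM's `examples/run.py` and reuses the tokenizer/engine paths from the benchmark command below; argument names and output shapes can change between releases, so treat it as a sketch rather than the benchmark script.

```python
import torch
from transformers import AutoTokenizer
from tensorrt_llm.runtime import ModelRunner

# Smoke test for a built engine, modeled on TensorRT-LLM's examples/run.py.
# Paths and max_new_tokens are placeholders; API details may vary by version.
tokenizer = AutoTokenizer.from_pretrained("/data/llama-2-7b-hf")
runner = ModelRunner.from_dir(engine_dir="/data/llama-2-7b-engine")

batch_input_ids = [
    torch.tensor(tokenizer.encode(p), dtype=torch.int32)
    for p in ["Hello, my name is", "The capital of France is"]
]

with torch.no_grad():
    output_ids = runner.generate(
        batch_input_ids,
        max_new_tokens=32,
        end_id=tokenizer.eos_token_id,
        pad_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
    )

# output_ids is expected to have shape [batch, num_beams, seq_len]
for seq in output_ids[:, 0, :]:
    print(tokenizer.decode(seq, skip_special_tokens=True))
```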
python3 tensorrtllm_run_benchmark.py --max_output_len=100 --tokenizer_dir /data/llama-2-7b-hf --engine_dir /data/llama-2-7b-engine --input_file /data/dataset/Chatbot_group_10_2.json --batch_size 16
mpirun -n 2 python run.py --max_output_len=100 --every_batch_cost_print True --tokenizer_dir /data/tensorrtllm_test/opt-13b/ --engine_dir /data/tensorrtllm_test/opt-13b-trtllm-build/ --input_file /data/opt-13b-test/Chatbot_group_10.json --batch_size 8
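Whichever backend is being measured, the raw timings usually get reduced to the same few metrics. The helper below is hypothetical (not part of this repository) and just spells out the arithmetic for throughput and per-batch latency.

```python
from dataclasses import dataclass

# Hypothetical helper, not part of this repository: reduces raw per-batch
# measurements to the usual throughput/latency summary numbers.
@dataclass
class BatchResult:
    num_requests: int       # prompts in the batch
    generated_tokens: int   # tokens produced for the whole batch
    elapsed_s: float        # wall-clock time for the batch

def summarize(results):
    total_tokens = sum(r.generated_tokens for r in results)
    total_requests = sum(r.num_requests for r in results)
    total_time = sum(r.elapsed_s for r in results)
    return {
        "throughput_tok_per_s": total_tokens / total_time,
        "requests_per_s": total_requests / total_time,
        "avg_latency_s_per_batch": total_time / len(results),
    }

print(summarize([BatchResult(16, 1600, 12.5), BatchResult(16, 1580, 12.1)]))
```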