安装
安装依赖
# Install the GCC 12 toolchain and NUMA headers required to compile vLLM's CPU backend
sudo apt-get update -y
sudo apt-get install -y gcc-12 g++-12 libnuma-dev
# Register gcc-12 as the default gcc (priority 10) and slave g++-12 to it,
# so the vLLM build picks up GCC 12 via the plain gcc/g++ names
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
创建虚拟环境
# Create a Python 3.12 virtual environment with uv
# (--seed pre-installs pip/setuptools/wheel into the new env) and activate it
uv venv --python 3.12 --seed
source .venv/bin/activate
克隆 vLLM 项目
# Clone the vLLM source tree into ./vllm_source and enter it.
git clone https://github.com/vllm-project/vllm.git vllm_source
# Guard the cd (ShellCheck SC2164): if the clone failed, stop instead of
# running the remaining build commands in the wrong directory.
cd vllm_source || exit 1
安装所需依赖
# Install build-time deps first, then runtime deps;
# --torch-backend cpu selects CPU-only PyTorch wheels (no CUDA download)
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
构建并安装 vLLM
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
兼容 OpenAI 的服务器
vllm serve Qwen/Qwen3-1.7B
# Without specifying a model name.
# NOTE(review): a plain `vllm serve` with no model argument will error out;
# presumably this line is shown to illustrate that the model is required — confirm intent.
vllm serve
报错
(EngineCore_DP0 pid=2829) ERROR 01-30 05:44:14 [core.py:946] ValueError: To serve at least one request with the models's max seq len (40960), (4.38 GiB KV cache is needed, which is larger than the available KV cache memory (3.88 GiB). Based on the available memory, the estimated maximum model length is 36224. Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine. See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ for more details.
解决
# Cap the CPU KV-cache allocation. Unit is GiB; try 1 or 0.5.
# NOTE(review): some vLLM versions parse VLLM_CPU_KVCACHE_SPACE as an integer —
# confirm your version accepts fractional values like 0.5 before relying on it.
export VLLM_CPU_KVCACHE_SPACE=1 # unit: GiB; try 1 or 0.5
# Reduce the context length so the KV cache fits (the error above estimated
# a maximum model length of 36224 at the default 40960).
# NOTE(review): 2096 looks like a typo for 2048 — both fit, but confirm the intended value.
vllm serve Qwen/Qwen3-1.7B --max-model-len 2096 --dtype bfloat16 --gpu-memory-utilization 0.5
# 说明:--gpu-memory-utilization 0.5 这个在 CPU 模式下也影响预分配逻辑,设低点
export VLLM_USE_MODELSCOPE=True
安装前置依赖
uv pip install modelscope