diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm
index bde2200daf..2ed198e22f 100644
--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -20,7 +20,7 @@ ARG TORCH_BACKEND="cu128"
 # Pinned to commit before https://github.com/deepseek-ai/DeepGEMM/pull/112 for DeepGEMM which seems to break on H100:
 # "RuntimeError: Failed: CUDA runtime error csrc/jit/kernel_runtime.hpp:108 '98'"
 ARG DEEPGEMM_REF="03d0be3"
-ARG FLASHINF_REF="v0.2.8rc1"
+ARG FLASHINF_REF="v0.2.8.rc1"

 # Define general architecture ARGs for supporting both x86 and aarch64 builds.
 # ARCH: Used for package suffixes (e.g., amd64, arm64)
diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh
index f7cbdcf38f..f816363f38 100755
--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -167,14 +167,10 @@
 python setup.py install

 # Install Flash Infer
-if [ "$ARCH" = "arm64" ]; then
-    uv pip install flashinfer-python
-else
-    cd $INSTALLATION_DIR
-    git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-    cd flashinfer
-    git checkout $FLASHINF_REF
-    python -m pip install -v .
-fi
+cd $INSTALLATION_DIR
+git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
+cd flashinfer
+git checkout $FLASHINF_REF
+uv pip install -v .

 echo "vllm installation completed successfully"
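
Usage note: FLASHINF_REF is an ordinary Docker build ARG, so the pinned FlashInfer ref can be overridden at image build time without editing the Dockerfile. A minimal sketch, assuming the repository root as the build context (the image tag below is illustrative, not from this repo):

    # Hypothetical invocation; tag name and context path are assumptions.
    docker build -f container/Dockerfile.vllm \
      --build-arg FLASHINF_REF="v0.2.8.rc1" \
      -t dynamo-vllm:dev .

Since install_vllm.sh now builds FlashInfer from source unconditionally, the same FLASHINF_REF pin applies to both x86_64 and arm64 builds.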