8 changes: 4 additions & 4 deletions .github/workflows/container-validation-backends.yml
@@ -154,10 +154,10 @@ jobs:
framework: vllm
target: runtime
platform: 'linux/${{ matrix.platform.arch }}'
base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.11-cuda13.0-devel-ubuntu24.04' || '' }}
runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.11-cuda13.0-runtime-ubuntu24.04' || '' }}
cuda_version: ${{ matrix.platform.arch == 'arm64' && '130' || '' }}
torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu130' || '' }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
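The `${{ arch == 'arm64' && '<value>' || '' }}` expressions above pass explicit CUDA 13.0 tags only on arm64 and an empty string otherwise, so other platforms keep the downstream defaults. A rough shell sketch of the mapping these expressions encode (illustrative only; the variable names are assumptions, not workflow inputs):

    # Arch-dependent build args as set by the workflow matrix (empty string = use the default downstream)
    if [ "$PLATFORM_ARCH" = "arm64" ]; then
      BASE_IMAGE_TAG="25.11-cuda13.0-devel-ubuntu24.04"
      RUNTIME_IMAGE_TAG="25.11-cuda13.0-runtime-ubuntu24.04"
      CUDA_VERSION="130"
      TORCH_BACKEND="cu130"
    else
      BASE_IMAGE_TAG=""; RUNTIME_IMAGE_TAG=""; CUDA_VERSION=""; TORCH_BACKEND=""
    fi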
21 changes: 11 additions & 10 deletions container/Dockerfile.vllm
@@ -10,9 +10,9 @@ ARG BASE_IMAGE_TAG
ARG PYTHON_VERSION
ARG ENABLE_KVBM

ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.9"
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG RUNTIME_IMAGE_TAG="25.11-cuda13.0-runtime-ubuntu24.04"
ARG CUDA_VERSION="13.0"

# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.12.0"
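These ARG defaults can be overridden at build time without editing the Dockerfile; a minimal sketch, assuming a build from the repository root and ignoring the other build args that container/build.sh normally supplies:

    # One-off override of the runtime image/tag and CUDA version
    docker build -f container/Dockerfile.vllm \
      --build-arg RUNTIME_IMAGE=nvcr.io/nvidia/cuda-dl-base \
      --build-arg RUNTIME_IMAGE_TAG=25.11-cuda13.0-runtime-ubuntu24.04 \
      --build-arg CUDA_VERSION=13.0 \
      .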
@@ -188,8 +188,8 @@ COPY --from=base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbinary
COPY --from=base /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
RUN ln -s /usr/local/cuda/lib64/libcublas.so.12 /usr/local/cuda/lib64/libcublas.so
RUN ln -s /usr/local/cuda/lib64/libcublasLt.so.12 /usr/local/cuda/lib64/libcublasLt.so
RUN ln -s /usr/local/cuda/lib64/libcublas.so.13 /usr/local/cuda/lib64/libcublas.so
RUN ln -s /usr/local/cuda/lib64/libcublasLt.so.13 /usr/local/cuda/lib64/libcublasLt.so

# DeepGemm runs nvcc for JIT kernel compilation, however the CUDA include path
# is not properly set for compilation. Set CPATH to help nvcc find the headers.
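A quick sanity check of the CUDA pieces copied into the runtime stage, assuming the paths above and that nvcc is present as the DeepGEMM comment implies (sketch only, not part of this diff):

    # The symlinks should now resolve to the CUDA 13 SONAMEs copied from the base image
    ls -l /usr/local/cuda/lib64/libcublas.so /usr/local/cuda/lib64/libcublasLt.so
    # CPATH should cover the copied headers so DeepGEMM's JIT nvcc invocations can find them
    echo "CPATH=$CPATH"
    ls /usr/local/cuda/include/cuda.h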
@@ -234,9 +234,9 @@ RUN apt-get update && \
ninja-build \
g++ \
# prometheus dependencies
ca-certificates \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-12-9 && \
ca-certificates && \
# DeepGemm uses 'cuobjdump' which does not come with the vanilla CUDA image
# cuda-command-line-tools-12-9 && \
# cuda-cuobjdump-13-0 is pre-installed in nvcr.io/nvidia/cuda-dl-base:25.11-cuda13.0-runtime-ubuntu24.04
rm -rf /var/lib/apt/lists/*

USER dynamo
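Since the change relies on cuda-dl-base already shipping 'cuobjdump', a defensive check such as the following could catch a future base-image swap (a sketch, not part of this diff):

    # Fail fast if the runtime base image stops bundling cuobjdump (needed by DeepGEMM)
    command -v cuobjdump >/dev/null 2>&1 || {
      echo "cuobjdump not found; install cuda-cuobjdump-13-0 or use a cuda-dl-base runtime image" >&2
      exit 1
    }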
@@ -272,12 +273,12 @@ COPY --chown=dynamo: benchmarks/ /workspace/benchmarks/
# Install dynamo, NIXL, and dynamo-specific dependencies
ARG ENABLE_KVBM
COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN uv pip install \
RUN uv pip install --no-cache \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
&& if [ "${ENABLE_KVBM}" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
uv pip install --no-cache /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /workspace/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache .
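A hedged smoke test after the wheel install; the distribution names are inferred from the wheel filenames above:

    # List the Dynamo, NIXL, and (optionally) KVBM packages that should now be installed
    uv pip list | grep -Ei 'ai[-_]dynamo|nixl|kvbm'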
16 changes: 8 additions & 8 deletions container/build.sh
@@ -106,7 +106,7 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
# for details and reproducer to manually test if the image
# can be updated to later versions.
VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04"
VLLM_BASE_IMAGE_TAG="25.11-cuda13.0-devel-ubuntu24.04"

NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
@@ -511,24 +511,24 @@ BUILD_ARGS+=" --build-arg DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA "
if [[ $FRAMEWORK == "VLLM" ]] && [[ "$PLATFORM" == *"linux/arm64"* ]]; then
# Set base image tag to CUDA 13.0 if using the default value (user didn't override)
if [ "$BASE_IMAGE_TAG" == "$VLLM_BASE_IMAGE_TAG" ]; then
BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04"
BASE_IMAGE_TAG="25.11-cuda13.0-devel-ubuntu24.04"
echo "INFO: Automatically setting base-image-tag to $BASE_IMAGE_TAG for vLLM ARM64"
fi

# Add required build args if not already present
if [[ "$BUILD_ARGS" != *"RUNTIME_IMAGE_TAG"* ]]; then
BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=12.9.0-runtime-ubuntu24.04 "
echo "INFO: Automatically setting RUNTIME_IMAGE_TAG=12.9.0-runtime-ubuntu24.04 for vLLM ARM64"
BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=25.11-cuda13.0-runtime-ubuntu24.04 "
echo "INFO: Automatically setting RUNTIME_IMAGE_TAG=25.11-cuda13.0-runtime-ubuntu24.04 for vLLM ARM64"
fi

if [[ "$BUILD_ARGS" != *"CUDA_VERSION"* ]]; then
BUILD_ARGS+=" --build-arg CUDA_VERSION=129 "
echo "INFO: Automatically setting CUDA_VERSION=129 for vLLM ARM64"
BUILD_ARGS+=" --build-arg CUDA_VERSION=130 "
echo "INFO: Automatically setting CUDA_VERSION=130 for vLLM ARM64"
fi

if [[ "$BUILD_ARGS" != *"TORCH_BACKEND"* ]]; then
BUILD_ARGS+=" --build-arg TORCH_BACKEND=cu129 "
echo "INFO: Automatically setting TORCH_BACKEND=cu129 for vLLM ARM64"
BUILD_ARGS+=" --build-arg TORCH_BACKEND=cu130 "
echo "INFO: Automatically setting TORCH_BACKEND=cu130 for vLLM ARM64"
fi

fi
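Assuming build.sh accepts the framework and platform flags used elsewhere in the repository (the flag spellings here are assumptions), an ARM64 build that relies on these automatic settings might look like:

    # The script injects the CUDA 13.0 base/runtime tags, CUDA_VERSION=130, and TORCH_BACKEND=cu130 for vLLM on arm64;
    # passing any of them explicitly skips the corresponding auto-setting.
    ./container/build.sh --framework vllm --platform linux/arm64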
71 changes: 42 additions & 29 deletions container/deps/vllm/install_vllm.sh
@@ -9,19 +9,20 @@
# 3. DeepGEMM
# 4. EP kernels

set -euo pipefail
set -euox pipefail

VLLM_REF="v0.12.0"
VLLM_VER="0.12.0"
VLLM_REF="v${VLLM_VER}"

# Basic Configurations
ARCH=$(uname -m)
MAX_JOBS=16
INSTALLATION_DIR=/tmp

# VLLM and Dependency Configurations
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- check if we need to add 12.0+PTX
DEEPGEMM_REF=""
CUDA_VERSION="12.9"
CUDA_VERSION="13.0"
FLASHINF_REF="v0.5.3"
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
LMCACHE_REF="0.3.10"
@@ -95,56 +96,68 @@ fi
export MAX_JOBS=$MAX_JOBS
export CUDA_HOME=/usr/local/cuda

# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
# Derive torch backend from CUDA version (e.g., "13.0" -> "cu130")
TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')"

echo "=== Installing prerequisites ==="
uv pip install pip cuda-python

uv pip install --no-cache pip cuda-python
echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"

echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
echo "✓ LMCache ${LMCACHE_REF} installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi

echo "\n=== Cloning vLLM repository ==="
# Clone needed for DeepGEMM and EP kernels install scripts
# Clone needed for DeepGEMM and EP kernels install scripts and to build from source on ARM64
cd $INSTALLATION_DIR
git clone https://github.com/vllm-project/vllm.git vllm
git clone https://github.com/vllm-project/vllm vllm
cd vllm
git checkout $VLLM_REF

echo "\n=== Installing vLLM & FlashInfer ==="
echo "Installing vLLM $VLLM_REF from PyPI..."
git checkout ${VLLM_REF}
# TODO: remove this cherry-pick when vllm is upgraded to > 0.12.0 (when the fix is shipped)
git cherry-pick --no-commit 799804d140fc99ce3964648ba91aaa810cf28fef # nvshmem fix for CUDA 13.0
echo " vLLM repository cloned"

uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}

echo "✓ vLLM installation completed"
echo "\n=== Installing vLLM & FlashInfer ==="
if [ "$ARCH" = "amd64" ]; then
echo "Installing vLLM $VLLM_REF from PyPI..."
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
# temporarily skip lmcache==${LMCACHE_REF}: it only supports CUDA 12 at this time
uv pip install \
--no-cache \
--index-strategy=unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
nixl[cu13]==0.7.1 \
https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_31_x86_64.whl[flashinfer] \
--torch-backend=${TORCH_BACKEND}
# uv pip uninstall cupy-cuda12x # lmcache still lists cupy-cuda12x as dependency - uninstall it first
# uv pip --no-cache install cupy-cuda13x
uv pip install --no-cache flashinfer-cubin==$FLASHINF_REF
uv pip install --no-cache flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "✓ vLLM installation completed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues, missing aarch64 wheels)"
echo "Building vLLM from source for ${ARCH} architecture..."
echo "Try to install specific PyTorch and other dependencies first"
uv pip install --no-cache --index-strategy=unsafe-best-match --index https://download.pytorch.org/whl/ -r requirements/cuda.txt
uv pip install --no-cache setuptools_scm # required to build vLLM from source
MAX_JOBS=${MAX_JOBS} uv pip install -v --no-cache --no-build-isolation .
fi
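# Post-install smoke test for either branch (sketch; the version strings are expectations, not guarantees):
# vLLM should report ${VLLM_VER} and torch should report a CUDA 13.x build when TORCH_BACKEND=cu130.
python3 -c "import vllm, torch; print(vllm.__version__, torch.__version__, torch.version.cuda)"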

echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools

if [ -n "$DEEPGEMM_REF" ]; then
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
UV_NO_CACHE=1 bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
else
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
UV_NO_CACHE=1 bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
fi
echo "✓ DeepGEMM installation completed"

echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
UV_NO_CACHE=1 TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
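# If newer GPUs must be covered (see the "12.0+PTX" note above), the arch list could be derived
# from the build machine instead of hard-coded; a sketch assuming nvidia-smi is available:
if command -v nvidia-smi >/dev/null 2>&1; then
  DETECTED_ARCHS=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | sort -u | paste -sd ';' -)
  TORCH_CUDA_ARCH_LIST="${DETECTED_ARCHS:-$TORCH_CUDA_ARCH_LIST}"
  echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
fi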

echo "\n✅ All installations completed successfully!"
Loading