8 changes: 4 additions & 4 deletions .github/workflows/container-validation-backends.yml
@@ -154,10 +154,10 @@ jobs:
framework: vllm
target: runtime
platform: 'linux/${{ matrix.platform.arch }}'
base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.11-cuda13.0-devel-ubuntu24.04' || '' }}
runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.11-cuda13.0-runtime-ubuntu24.04' || '' }}
cuda_version: ${{ matrix.platform.arch == 'arm64' && '130' || '' }}
torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu130' || '' }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
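The `${{ arch == 'arm64' && '<value>' || '' }}` expressions above pass explicit CUDA 13.0 tags only on arm64 and an empty string otherwise, so other platforms keep the downstream defaults. A rough shell sketch of the mapping these expressions encode (illustrative only; the variable names are assumptions, not workflow inputs):

    # Arch-dependent build args as set by the workflow matrix (empty string = use the default downstream)
    if [ "$PLATFORM_ARCH" = "arm64" ]; then
      BASE_IMAGE_TAG="25.11-cuda13.0-devel-ubuntu24.04"
      RUNTIME_IMAGE_TAG="25.11-cuda13.0-runtime-ubuntu24.04"
      CUDA_VERSION="130"
      TORCH_BACKEND="cu130"
    else
      BASE_IMAGE_TAG=""; RUNTIME_IMAGE_TAG=""; CUDA_VERSION=""; TORCH_BACKEND=""
    fi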
21 changes: 11 additions & 10 deletions container/Dockerfile.vllm
@@ -10,9 +10,9 @@ ARG BASE_IMAGE_TAG
ARG PYTHON_VERSION
ARG ENABLE_KVBM

ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.9"
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG RUNTIME_IMAGE_TAG="25.11-cuda13.0-runtime-ubuntu24.04"
ARG CUDA_VERSION="13.0"

# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.12.0"
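These ARG defaults can be overridden at build time without editing the Dockerfile; a minimal sketch, assuming a build from the repository root and ignoring the other build args that container/build.sh normally supplies:

    # One-off override of the runtime image/tag and CUDA version
    docker build -f container/Dockerfile.vllm \
      --build-arg RUNTIME_IMAGE=nvcr.io/nvidia/cuda-dl-base \
      --build-arg RUNTIME_IMAGE_TAG=25.11-cuda13.0-runtime-ubuntu24.04 \
      --build-arg CUDA_VERSION=13.0 \
      .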
@@ -188,8 +188,8 @@ COPY --from=base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbinary
COPY --from=base /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
RUN ln -s /usr/local/cuda/lib64/libcublas.so.12 /usr/local/cuda/lib64/libcublas.so
RUN ln -s /usr/local/cuda/lib64/libcublasLt.so.12 /usr/local/cuda/lib64/libcublasLt.so
RUN ln -s /usr/local/cuda/lib64/libcublas.so.13 /usr/local/cuda/lib64/libcublas.so
RUN ln -s /usr/local/cuda/lib64/libcublasLt.so.13 /usr/local/cuda/lib64/libcublasLt.so

# DeepGemm runs nvcc for JIT kernel compilation, however the CUDA include path
# is not properly set for compilation. Set CPATH to help nvcc find the headers.
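A quick sanity check of the CUDA pieces copied into the runtime stage, assuming the paths above and that nvcc is present as the DeepGEMM comment implies (sketch only, not part of this diff):

    # The symlinks should now resolve to the CUDA 13 SONAMEs copied from the base image
    ls -l /usr/local/cuda/lib64/libcublas.so /usr/local/cuda/lib64/libcublasLt.so
    # CPATH should cover the copied headers so DeepGEMM's JIT nvcc invocations can find them
    echo "CPATH=$CPATH"
    ls /usr/local/cuda/include/cuda.h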
@@ -234,9 +234,9 @@ RUN apt-get update && \
ninja-build \
g++ \
# prometheus dependencies
ca-certificates \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-12-9 && \
ca-certificates && \
# DeepGemm uses 'cuobjdump' which does not come with the vanilla CUDA image
# cuda-command-line-tools-12-9 && \
# cuda-cuobjdump-13-0 is pre-installed in nvcr.io/nvidia/cuda-dl-base:25.11-cuda13.0-runtime-ubuntu24.04
rm -rf /var/lib/apt/lists/*

USER dynamo
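Since the change relies on cuda-dl-base already shipping 'cuobjdump', a defensive check such as the following could catch a future base-image swap (a sketch, not part of this diff):

    # Fail fast if the runtime base image stops bundling cuobjdump (needed by DeepGEMM)
    command -v cuobjdump >/dev/null 2>&1 || {
      echo "cuobjdump not found; install cuda-cuobjdump-13-0 or use a cuda-dl-base runtime image" >&2
      exit 1
    }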
@@ -272,12 +273,12 @@ COPY --chown=dynamo: benchmarks/ /workspace/benchmarks/
# Install dynamo, NIXL, and dynamo-specific dependencies
ARG ENABLE_KVBM
COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN uv pip install \
RUN uv pip install --no-cache \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
&& if [ "${ENABLE_KVBM}" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
uv pip install --no-cache /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /workspace/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache .
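A hedged smoke test after the wheel install; the distribution names are inferred from the wheel filenames above:

    # List the Dynamo, NIXL, and (optionally) KVBM packages that should now be installed
    uv pip list | grep -Ei 'ai[-_]dynamo|nixl|kvbm'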
16 changes: 8 additions & 8 deletions container/build.sh
@@ -106,7 +106,7 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
# for details and reproducer to manually test if the image
# can be updated to later versions.
VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04"
VLLM_BASE_IMAGE_TAG="25.11-cuda13.0-devel-ubuntu24.04"

NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
@@ -511,24 +511,24 @@ BUILD_ARGS+=" --build-arg DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA "
if [[ $FRAMEWORK == "VLLM" ]] && [[ "$PLATFORM" == *"linux/arm64"* ]]; then
# Set base image tag to CUDA 13.0 if using the default value (user didn't override)
if [ "$BASE_IMAGE_TAG" == "$VLLM_BASE_IMAGE_TAG" ]; then
BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04"
BASE_IMAGE_TAG="25.11-cuda13.0-devel-ubuntu24.04"
echo "INFO: Automatically setting base-image-tag to $BASE_IMAGE_TAG for vLLM ARM64"
fi

# Add required build args if not already present
if [[ "$BUILD_ARGS" != *"RUNTIME_IMAGE_TAG"* ]]; then
BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=12.9.0-runtime-ubuntu24.04 "
echo "INFO: Automatically setting RUNTIME_IMAGE_TAG=12.9.0-runtime-ubuntu24.04 for vLLM ARM64"
BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=25.11-cuda13.0-runtime-ubuntu24.04 "
echo "INFO: Automatically setting RUNTIME_IMAGE_TAG=25.11-cuda13.0-runtime-ubuntu24.04 for vLLM ARM64"
fi

if [[ "$BUILD_ARGS" != *"CUDA_VERSION"* ]]; then
BUILD_ARGS+=" --build-arg CUDA_VERSION=129 "
echo "INFO: Automatically setting CUDA_VERSION=129 for vLLM ARM64"
BUILD_ARGS+=" --build-arg CUDA_VERSION=130 "
echo "INFO: Automatically setting CUDA_VERSION=130 for vLLM ARM64"
fi

if [[ "$BUILD_ARGS" != *"TORCH_BACKEND"* ]]; then
BUILD_ARGS+=" --build-arg TORCH_BACKEND=cu129 "
echo "INFO: Automatically setting TORCH_BACKEND=cu129 for vLLM ARM64"
BUILD_ARGS+=" --build-arg TORCH_BACKEND=cu130 "
echo "INFO: Automatically setting TORCH_BACKEND=cu130 for vLLM ARM64"
fi

fi
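Assuming build.sh accepts the framework and platform flags used elsewhere in the repository (the flag spellings here are assumptions), an ARM64 build that relies on these automatic settings might look like:

    # The script injects the CUDA 13.0 base/runtime tags, CUDA_VERSION=130, and TORCH_BACKEND=cu130 for vLLM on arm64;
    # passing any of them explicitly skips the corresponding auto-setting.
    ./container/build.sh --framework vllm --platform linux/arm64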
71 changes: 42 additions & 29 deletions container/deps/vllm/install_vllm.sh
@@ -9,19 +9,20 @@
# 3. DeepGEMM
# 4. EP kernels

set -euo pipefail
set -euox pipefail

VLLM_REF="v0.12.0"
VLLM_VER="0.12.0"
VLLM_REF="v${VLLM_VER}"

# Basic Configurations
ARCH=$(uname -m)
MAX_JOBS=16
INSTALLATION_DIR=/tmp

# VLLM and Dependency Configurations
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- check if we need to add 12.0+PTX
DEEPGEMM_REF=""
CUDA_VERSION="12.9"
CUDA_VERSION="13.0"
FLASHINF_REF="v0.5.3"
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
LMCACHE_REF="0.3.10"
@@ -95,56 +96,68 @@ fi
export MAX_JOBS=$MAX_JOBS
export CUDA_HOME=/usr/local/cuda

# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
# Derive torch backend from CUDA version (e.g., "13.0" -> "cu130")
TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')"

echo "=== Installing prerequisites ==="
uv pip install pip cuda-python

uv pip install --no-cache pip cuda-python
echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"

echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
echo "✓ LMCache ${LMCACHE_REF} installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi

echo "\n=== Cloning vLLM repository ==="
# Clone needed for DeepGEMM and EP kernels install scripts
# Clone needed for DeepGEMM and EP kernels install scripts and to build from source on ARM64
cd $INSTALLATION_DIR
git clone https://github.com/vllm-project/vllm.git vllm
git clone https://github.com/vllm-project/vllm vllm
cd vllm
git checkout $VLLM_REF

echo "\n=== Installing vLLM & FlashInfer ==="
echo "Installing vLLM $VLLM_REF from PyPI..."
git checkout ${VLLM_REF}
# TODO: remove this cherry-pick when vllm is upgraded to > 0.12.0 (when the fix is shipped)
git cherry-pick --no-commit 799804d140fc99ce3964648ba91aaa810cf28fef # nvshmem fix for CUDA 13.0
echo " vLLM repository cloned"

uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}

echo "✓ vLLM installation completed"
echo "\n=== Installing vLLM & FlashInfer ==="
if [ "$ARCH" = "amd64" ]; then
echo "Installing vLLM $VLLM_REF from PyPI..."
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
# temporarily skip lmcache==${LMCACHE_REF}: it only supports CUDA 12 at this time
uv pip install \
--no-cache \
--index-strategy=unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
nixl[cu13]==0.7.1 \
https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_31_x86_64.whl[flashinfer] \
--torch-backend=${TORCH_BACKEND}
# uv pip uninstall cupy-cuda12x # lmcache still lists cupy-cuda12x as dependency - uninstall it first
# uv pip --no-cache install cupy-cuda13x
uv pip install --no-cache flashinfer-cubin==$FLASHINF_REF
uv pip install --no-cache flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "✓ vLLM installation completed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues, missing aarch64 wheels)"
echo "Building vLLM from source for ${ARCH} architecture..."
echo "Try to install specific PyTorch and other dependencies first"
uv pip install --no-cache --index-strategy=unsafe-best-match --index https://download.pytorch.org/whl/ -r requirements/cuda.txt
uv pip install --no-cache setuptools_scm # required to build vLLM from source
MAX_JOBS=${MAX_JOBS} uv pip install -v --no-cache --no-build-isolation .
fi
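# Post-install smoke test for either branch (sketch; the version strings are expectations, not guarantees):
# vLLM should report ${VLLM_VER} and torch should report a CUDA 13.x build when TORCH_BACKEND=cu130.
python3 -c "import vllm, torch; print(vllm.__version__, torch.__version__, torch.version.cuda)"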

echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools

if [ -n "$DEEPGEMM_REF" ]; then
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
UV_NO_CACHE=1 bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
else
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
UV_NO_CACHE=1 bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
fi
echo "✓ DeepGEMM installation completed"

echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
UV_NO_CACHE=1 TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
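# If newer GPUs must be covered (see the "12.0+PTX" note above), the arch list could be derived
# from the build machine instead of hard-coded; a sketch assuming nvidia-smi is available:
if command -v nvidia-smi >/dev/null 2>&1; then
  DETECTED_ARCHS=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | sort -u | paste -sd ';' -)
  TORCH_CUDA_ARCH_LIST="${DETECTED_ARCHS:-$TORCH_CUDA_ARCH_LIST}"
  echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
fi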

echo "\n✅ All installations completed successfully!"
Loading