diff --git a/.github/workflows/container-validation-backends.yml b/.github/workflows/container-validation-backends.yml
index c8af077259..2988ddf13e 100644
--- a/.github/workflows/container-validation-backends.yml
+++ b/.github/workflows/container-validation-backends.yml
@@ -154,10 +154,10 @@ jobs:
       framework: vllm
       target: runtime
       platform: 'linux/${{ matrix.platform.arch }}'
-      base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
-      runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
-      cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
-      torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
+      base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.11-cuda13.0-devel-ubuntu24.04' || '' }}
+      runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.11-cuda13.0-runtime-ubuntu24.04' || '' }}
+      cuda_version: ${{ matrix.platform.arch == 'arm64' && '130' || '' }}
+      torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu130' || '' }}
      ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
      ci_token: ${{ secrets.CI_TOKEN }}
      aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm
index ea76a01351..4fc0188885 100644
--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -10,9 +10,9 @@ ARG BASE_IMAGE_TAG
 ARG PYTHON_VERSION
 ARG ENABLE_KVBM
 
-ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
-ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
-ARG CUDA_VERSION="12.9"
+ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda-dl-base"
+ARG RUNTIME_IMAGE_TAG="25.11-cuda13.0-runtime-ubuntu24.04"
+ARG CUDA_VERSION="13.0"
 
 # Make sure to update the dependency version in pyproject.toml when updating this
 ARG VLLM_REF="v0.12.0"
@@ -188,8 +188,8 @@ COPY --from=base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbinary
 COPY --from=base /usr/local/cuda/include/ /usr/local/cuda/include/
 COPY --from=base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
 COPY --from=base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
-RUN ln -s /usr/local/cuda/lib64/libcublas.so.12 /usr/local/cuda/lib64/libcublas.so
-RUN ln -s /usr/local/cuda/lib64/libcublasLt.so.12 /usr/local/cuda/lib64/libcublasLt.so
+RUN ln -s /usr/local/cuda/lib64/libcublas.so.13 /usr/local/cuda/lib64/libcublas.so
+RUN ln -s /usr/local/cuda/lib64/libcublasLt.so.13 /usr/local/cuda/lib64/libcublasLt.so
 
 # DeepGemm runs nvcc for JIT kernel compilation, however the CUDA include path
 # is not properly set for complilation. Set CPATH to help nvcc find the headers.
@@ -234,9 +234,10 @@ RUN apt-get update && \
         ninja-build \
         g++ \
         # prometheus dependencies
-        ca-certificates \
-        # DeepGemm uses 'cuobjdump' which does not come with CUDA image
-        cuda-command-line-tools-12-9 && \
+        ca-certificates && \
+        # DeepGemm uses 'cuobjdump' which does not come with the vanilla CUDA image
+        # cuda-command-line-tools-12-9 && \
+        # cuda-cuobjdump-13-0 is pre-installed in nvcr.io/nvidia/cuda-dl-base:25.11-cuda13.0-runtime-ubuntu24.04
     rm -rf /var/lib/apt/lists/*
 
 USER dynamo
@@ -272,12 +273,12 @@ COPY --chown=dynamo: benchmarks/ /workspace/benchmarks/
 # Install dynamo, NIXL, and dynamo-specific dependencies
 ARG ENABLE_KVBM
 COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
-RUN uv pip install \
+RUN uv pip install --no-cache \
     /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
     /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
     /opt/dynamo/wheelhouse/nixl/nixl*.whl \
     && if [ "${ENABLE_KVBM}" = "true" ]; then \
-        uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
+        uv pip install --no-cache /opt/dynamo/wheelhouse/kvbm*.whl; \
     fi \
     && cd /workspace/benchmarks \
     && UV_GIT_LFS=1 uv pip install --no-cache .
diff --git a/container/build.sh b/container/build.sh
index dffa85d892..de261c819c 100755
--- a/container/build.sh
+++ b/container/build.sh
@@ -106,7 +106,7 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 # Please check https://github.com/ai-dynamo/dynamo/pull/1065
 # for details and reproducer to manually test if the image
 # can be updated to later versions.
-VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04"
+VLLM_BASE_IMAGE_TAG="25.11-cuda13.0-devel-ubuntu24.04"
 
 NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
@@ -511,24 +511,24 @@ BUILD_ARGS+=" --build-arg DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA "
 if [[ $FRAMEWORK == "VLLM" ]] && [[ "$PLATFORM" == *"linux/arm64"* ]]; then
     # Set base image tag to CUDA 12.9 if using the default value (user didn't override)
     if [ "$BASE_IMAGE_TAG" == "$VLLM_BASE_IMAGE_TAG" ]; then
-        BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04"
+        BASE_IMAGE_TAG="25.11-cuda13.0-devel-ubuntu24.04"
         echo "INFO: Automatically setting base-image-tag to $BASE_IMAGE_TAG for vLLM ARM64"
     fi
 
     # Add required build args if not already present
     if [[ "$BUILD_ARGS" != *"RUNTIME_IMAGE_TAG"* ]]; then
-        BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=12.9.0-runtime-ubuntu24.04 "
-        echo "INFO: Automatically setting RUNTIME_IMAGE_TAG=12.9.0-runtime-ubuntu24.04 for vLLM ARM64"
+        BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=25.11-cuda13.0-runtime-ubuntu24.04 "
+        echo "INFO: Automatically setting RUNTIME_IMAGE_TAG=25.11-cuda13.0-runtime-ubuntu24.04 for vLLM ARM64"
     fi
 
     if [[ "$BUILD_ARGS" != *"CUDA_VERSION"* ]]; then
-        BUILD_ARGS+=" --build-arg CUDA_VERSION=129 "
-        echo "INFO: Automatically setting CUDA_VERSION=129 for vLLM ARM64"
+        BUILD_ARGS+=" --build-arg CUDA_VERSION=130 "
+        echo "INFO: Automatically setting CUDA_VERSION=130 for vLLM ARM64"
     fi
 
     if [[ "$BUILD_ARGS" != *"TORCH_BACKEND"* ]]; then
-        BUILD_ARGS+=" --build-arg TORCH_BACKEND=cu129 "
-        echo "INFO: Automatically setting TORCH_BACKEND=cu129 for vLLM ARM64"
+        BUILD_ARGS+=" --build-arg TORCH_BACKEND=cu130 "
+        echo "INFO: Automatically setting TORCH_BACKEND=cu130 for vLLM ARM64"
     fi
 fi
 
diff --git a/container/deps/vllm/install_vllm.sh b/container/deps/vllm/install_vllm.sh
index 8365deecf6..8beb8ac370 100755
--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -9,9 +9,10 @@
 # 3. DeepGEMM
 # 4. EP kernels
 
-set -euo pipefail
+set -euox pipefail
 
-VLLM_REF="v0.12.0"
+VLLM_VER="0.12.0"
+VLLM_REF="v${VLLM_VER}"
 
 # Basic Configurations
 ARCH=$(uname -m)
@@ -19,9 +20,9 @@ MAX_JOBS=16
 INSTALLATION_DIR=/tmp
 
 # VLLM and Dependency Configurations
-TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
+TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- check if we need to add 12.0+PTX
 DEEPGEMM_REF=""
-CUDA_VERSION="12.9"
+CUDA_VERSION="13.0"
 FLASHINF_REF="v0.5.3"
 # LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
 LMCACHE_REF="0.3.10"
@@ -95,56 +96,68 @@ fi
 export MAX_JOBS=$MAX_JOBS
 export CUDA_HOME=/usr/local/cuda
 
-# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
+# Derive torch backend from CUDA version (e.g., "13.0" -> "cu130")
 TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')"
 
 echo "=== Installing prerequisites ==="
-uv pip install pip cuda-python
-
+uv pip install --no-cache pip cuda-python
 
 echo "\n=== Configuration Summary ==="
 echo "  VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
 echo "  FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
 echo "  TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"
 
-echo "\n=== Installing LMCache ==="
-if [ "$ARCH" = "amd64" ]; then
-    # LMCache installation currently fails on arm64 due to CUDA dependency issues
-    # Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
-    uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
-    echo "✓ LMCache ${LMCACHE_REF} installed"
-else
-    echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
-fi
 
 echo "\n=== Cloning vLLM repository ==="
-# Clone needed for DeepGEMM and EP kernels install scripts
+# Clone needed for DeepGEMM and EP kernels install scripts and to build from source on ARM64
 cd $INSTALLATION_DIR
-git clone https://github.com/vllm-project/vllm.git vllm
+git clone https://github.com/vllm-project/vllm vllm
 cd vllm
-git checkout $VLLM_REF
-
-echo "\n=== Installing vLLM & FlashInfer ==="
-echo "Installing vLLM $VLLM_REF from PyPI..."
+git checkout ${VLLM_REF}
+# TODO: remove this cherry-pick when vllm is upgraded to > 0.12.0 (when the fix is shipped)
+git cherry-pick --no-commit 799804d140fc99ce3964648ba91aaa810cf28fef # nvshmem fix for CUDA 13.0
+echo "✓ vLLM repository cloned"
 
-uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=${TORCH_BACKEND}
-uv pip install flashinfer-cubin==$FLASHINF_REF
-uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
-echo "✓ vLLM installation completed"
+echo "\n=== Installing vLLM & FlashInfer ==="
+if [ "$ARCH" = "amd64" ]; then
+    echo "Installing vLLM $VLLM_REF from PyPI..."
+    # LMCache installation currently fails on arm64 due to CUDA dependency issues
+    # Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
+    # lmcache==${LMCACHE_REF} is temporarily disabled: it only supports CUDA 12 at this time
+    uv pip install \
+        --no-cache \
+        --index-strategy=unsafe-best-match \
+        --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
+        nixl[cu13]==0.7.1 \
+        https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_31_x86_64.whl[flashinfer] \
+        --torch-backend=${TORCH_BACKEND}
+    # uv pip uninstall cupy-cuda12x # lmcache still lists cupy-cuda12x as dependency - uninstall it first
+    # uv pip install --no-cache cupy-cuda13x
+    uv pip install --no-cache flashinfer-cubin==$FLASHINF_REF
+    uv pip install --no-cache flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
+    echo "✓ vLLM installation completed"
+else
+    echo "⚠ Skipping LMCache on ARM64 (compatibility issues, missing aarch64 wheels)"
+    echo "Building vLLM from source for ${ARCH} architecture..."
+    echo "Try to install specific PyTorch and other dependencies first"
+    uv pip install --no-cache --index-strategy=unsafe-best-match --index https://download.pytorch.org/whl/ -r requirements/cuda.txt
+    uv pip install --no-cache setuptools_scm # required to build vLLM from source
+    MAX_JOBS=${MAX_JOBS} uv pip install -v --no-cache --no-build-isolation .
+fi
 
 echo "\n=== Installing DeepGEMM ==="
 cd $INSTALLATION_DIR/vllm/tools
 if [ -n "$DEEPGEMM_REF" ]; then
-    bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
+    UV_NO_CACHE=1 bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
 else
-    bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
+    UV_NO_CACHE=1 bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
 fi
 echo "✓ DeepGEMM installation completed"
 
 echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
 cd ep_kernels/
 # TODO we will be able to specify which pplx and deepep commit we want in future
-TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
+UV_NO_CACHE=1 TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
 
 echo "\n✅ All installations completed successfully!"
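
Note (not part of the diff): a minimal local sanity check for the CUDA 12.9 -> 13.0 switch, assuming a hypothetical image tag such as dynamo:latest-vllm produced by container/build.sh; substitute the tag your build actually emits, and adjust the python invocation if the image keeps torch inside a virtual environment.

    # Hypothetical tag produced by a local container/build.sh run for the vLLM framework
    IMAGE=dynamo:latest-vllm

    # The runtime stage should now resolve the cuBLAS symlinks against the .so.13 libraries
    docker run --rm "$IMAGE" ls -l /usr/local/cuda/lib64/libcublas.so /usr/local/cuda/lib64/libcublasLt.so

    # With the cu130 torch backend installed, torch should report a CUDA 13.0 build
    docker run --rm "$IMAGE" python3 -c "import torch; print(torch.version.cuda)"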