diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
index 596033854e7..a95e5ddf576 100644
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@@ -1,4 +1,15 @@
 name: Release Docker Images
+#
+# This workflow builds and publishes both framework and runtime Docker images:
+#
+# Framework images (full development environment):
+#   - lmsysorg/sglang:v{version}, lmsysorg/sglang:latest
+#   - lmsysorg/sglang:v{version}-cu129-{amd64,arm64}
+#
+# Runtime images (production-optimized, ~50% smaller):
+#   - lmsysorg/sglang:v{version}-runtime, lmsysorg/sglang:latest-runtime
+#   - lmsysorg/sglang:v{version}-cu129-{amd64,arm64}-runtime
+#
 on:
   push:
     branches:
@@ -45,12 +56,13 @@ jobs:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}

-      - name: Build and Push AMD64
+      - name: Build and Push AMD64 Framework
         run: |
           version=$(cat python/sglang/version.py | cut -d'"' -f2)
           tag=v${version}-cu129-amd64

           docker buildx build \
+            --target framework \
             --platform linux/amd64 \
             --push \
             -f docker/Dockerfile \
@@ -61,6 +73,40 @@ jobs:
             --no-cache \
             .

+      - name: Build and Push AMD64 Runtime
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-cu129-amd64-runtime
+
+          docker buildx build \
+            --target runtime \
+            --platform linux/amd64 \
+            --push \
+            -f docker/Dockerfile \
+            --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} \
+            --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \
+            --build-arg GRACE_BLACKWELL=${{ matrix.variant.grace_blackwell }} \
+            -t lmsysorg/sglang:${tag} \
+            --no-cache \
+            .
+
+      - name: Build and Push AMD64 Runtime (CUDA 13)
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-cu130-amd64-runtime
+
+          docker buildx build \
+            --target runtime \
+            --platform linux/amd64 \
+            --push \
+            -f docker/Dockerfile \
+            --build-arg CUDA_VERSION=13.0.1 \
+            --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \
+            --build-arg GRACE_BLACKWELL=0 \
+            -t lmsysorg/sglang:${tag} \
+            --no-cache \
+            .
+
   publish-arm64:
     if: github.repository == 'sgl-project/sglang'
     environment: "prod"
@@ -87,12 +133,30 @@ jobs:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}

-      - name: Build and Push ARM64
+      - name: Build and Push ARM64 Framework
         run: |
           version=$(cat python/sglang/version.py | cut -d'"' -f2)
           tag=v${version}-cu129-arm64

           docker buildx build \
+            --target framework \
+            --platform linux/arm64 \
+            --push \
+            -f docker/Dockerfile \
+            --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} \
+            --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \
+            --build-arg GRACE_BLACKWELL=${{ matrix.variant.grace_blackwell }} \
+            -t lmsysorg/sglang:${tag} \
+            --no-cache \
+            .
+
+      - name: Build and Push ARM64 Runtime
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-cu129-arm64-runtime
+
+          docker buildx build \
+            --target runtime \
             --platform linux/arm64 \
             --push \
             -f docker/Dockerfile \
@@ -103,6 +167,23 @@ jobs:
             --no-cache \
             .

+      - name: Build and Push ARM64 Runtime (CUDA 13)
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-cu130-arm64-runtime
+
+          docker buildx build \
+            --target runtime \
+            --platform linux/arm64 \
+            --push \
+            -f docker/Dockerfile \
+            --build-arg CUDA_VERSION=13.0.1 \
+            --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \
+            --build-arg GRACE_BLACKWELL=1 \
+            -t lmsysorg/sglang:${tag} \
+            --no-cache \
+            .
+
   create-manifests:
     runs-on: ubuntu-22.04
     needs: [publish-x86, publish-arm64]
@@ -125,14 +206,38 @@ jobs:
         run: |
           version=$(cat python/sglang/version.py | cut -d'"' -f2)

-          # Create versioned manifest
+          # Create versioned framework manifest (default)
           docker buildx imagetools create \
             -t lmsysorg/sglang:v${version} \
             lmsysorg/sglang:v${version}-cu129-amd64 \
             lmsysorg/sglang:v${version}-cu129-arm64

-          # Create latest manifest
+          # Create latest framework manifest (default)
           docker buildx imagetools create \
             -t lmsysorg/sglang:latest \
             lmsysorg/sglang:v${version}-cu129-amd64 \
             lmsysorg/sglang:v${version}-cu129-arm64
+
+          # Create versioned runtime manifest
+          docker buildx imagetools create \
+            -t lmsysorg/sglang:v${version}-runtime \
+            lmsysorg/sglang:v${version}-cu129-amd64-runtime \
+            lmsysorg/sglang:v${version}-cu129-arm64-runtime
+
+          # Create latest runtime manifest
+          docker buildx imagetools create \
+            -t lmsysorg/sglang:latest-runtime \
+            lmsysorg/sglang:v${version}-cu129-amd64-runtime \
+            lmsysorg/sglang:v${version}-cu129-arm64-runtime
+
+          # Create versioned CUDA 13 runtime manifest
+          docker buildx imagetools create \
+            -t lmsysorg/sglang:v${version}-cu130-runtime \
+            lmsysorg/sglang:v${version}-cu130-amd64-runtime \
+            lmsysorg/sglang:v${version}-cu130-arm64-runtime
+
+          # Create latest CUDA 13 runtime manifest
+          docker buildx imagetools create \
+            -t lmsysorg/sglang:latest-cu130-runtime \
+            lmsysorg/sglang:v${version}-cu130-amd64-runtime \
+            lmsysorg/sglang:v${version}-cu130-arm64-runtime
diff --git a/docker/Dockerfile b/docker/Dockerfile
index c3d7a7d21cb..97f7feb47b1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,10 +1,11 @@
 ARG CUDA_VERSION=12.9.1
-FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 AS base
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu24.04 AS base

 ARG TARGETARCH
 ARG BUILD_TYPE=all
 ARG BRANCH_TYPE=remote
 ARG GRACE_BLACKWELL=0
+ARG HOPPER_SBO=0
 ARG GRACE_BLACKWELL_DEEPEP_BRANCH=gb200_blog_part_2
 ARG HOPPER_SBO_DEEPEP_COMMIT=9f2fc4b3182a51044ae7ecb6610f7c9c3258c4d6

@@ -19,85 +20,162 @@ ARG UBUNTU_MIRROR
 ARG GITHUB_ARTIFACTORY=github.com
 ARG INSTALL_FLASHINFER_JIT_CACHE=0
 ARG FLASHINFER_VERSION=0.5.3
-ARG NVSHMEM_VERSION=3.4.5

 ENV DEBIAN_FRONTEND=noninteractive \
     CUDA_HOME=/usr/local/cuda \
     GDRCOPY_HOME=/usr/src/gdrdrv-${GDRCOPY_VERSION}/ \
     FLASHINFER_VERSION=${FLASHINFER_VERSION}

-# Add GKE default lib and bin locations.
+
+# Add GKE default lib and bin locations
 ENV PATH="${PATH}:/usr/local/nvidia/bin" \
     LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"

-# Replace Ubuntu sources if it is specified
+# Replace Ubuntu sources if specified
 RUN if [ -n "$UBUNTU_MIRROR" ]; then \
     sed -i "s|http://.*archive.ubuntu.com|$UBUNTU_MIRROR|g" /etc/apt/sources.list && \
     sed -i "s|http://.*security.ubuntu.com|$UBUNTU_MIRROR|g" /etc/apt/sources.list; \
     fi

-RUN --mount=type=cache,target=/var/cache/apt apt update && apt install wget -y && apt install software-properties-common -y \
-    && add-apt-repository ppa:deadsnakes/ppa -y \
-    && apt install python3.12-full python3.12-dev python3.10-venv -y \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 \
-    && update-alternatives --set python3 /usr/bin/python3.12 \
-    && wget https://bootstrap.pypa.io/get-pip.py \
-    && python3 get-pip.py \
-    # Fix for `apt-add-repository`
-    && cd /usr/lib/python3/dist-packages/ \
-    && ln -s apt_pkg.cpython-310-*-linux-gnu.so apt_pkg.so
-
-# Set timezone and install all packages
-RUN --mount=type=cache,target=/var/cache/apt echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update && apt-get install -y --no-install-recommends \
+# Python setup (combined with apt update to reduce layers)
+RUN --mount=type=cache,target=/var/cache/apt,id=base-apt \
+    apt update && apt install -y --no-install-recommends wget software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa -y \
+    && apt install -y --no-install-recommends python3.12-full python3.12-dev python3.10-venv \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 \
+    && update-alternatives --set python3 /usr/bin/python3.12 \
+    && wget -q https://bootstrap.pypa.io/get-pip.py \
+    && python3 get-pip.py --break-system-packages \
+    && rm get-pip.py \
+    # Allow pip to install packages globally (PEP 668 workaround for Ubuntu 24.04)
+    && python3 -m pip config set global.break-system-packages true \
+    # Fix for apt-add-repository
+    && cd /usr/lib/python3/dist-packages/ \
+    && ln -s apt_pkg.cpython-310-*-linux-gnu.so apt_pkg.so
+
+# Install system dependencies (organized by category for better caching)
+RUN --mount=type=cache,target=/var/cache/apt,id=base-apt \
+    echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update && apt-get install -y --no-install-recommends \
+    # Core system utilities
     tzdata \
-    software-properties-common netcat-openbsd kmod unzip openssh-server \
-    curl wget lsof zsh ccache tmux htop git-lfs tree \
-    build-essential cmake perl \
-    libopenmpi-dev libnuma1 libnuma-dev numactl \
-    libibverbs-dev libibverbs1 libibumad3 \
-    librdmacm1 libnl-3-200 libnl-route-3-200 libnl-route-3-dev libnl-3-dev \
-    ibverbs-providers infiniband-diags perftest \
-    libgoogle-glog-dev libgtest-dev libjsoncpp-dev libunwind-dev \
-    libboost-all-dev libssl-dev \
-    libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler protobuf-compiler-grpc \
+    ca-certificates \
+    software-properties-common \
+    netcat-openbsd \
+    kmod \
+    unzip \
+    openssh-server \
+    curl \
+    wget \
+    lsof \
+    locales \
+    # Build essentials (needed for framework stage)
+    build-essential \
+    cmake \
+    perl \
+    patchelf \
+    ccache \
+    git-lfs \
+    # MPI and NUMA
+    libopenmpi-dev \
+    libnuma1 \
+    libnuma-dev \
+    numactl \
+    # InfiniBand/RDMA
+    libibverbs-dev \
+    libibverbs1 \
+    libibumad3 \
+    librdmacm1 \
+    libnl-3-200 \
+    libnl-route-3-200 \
+    libnl-route-3-dev \
+    libnl-3-dev \
+    ibverbs-providers \
+    infiniband-diags \
+    perftest \
+    # Development libraries
+    libgoogle-glog-dev \
+    libgtest-dev \
+    libjsoncpp-dev \
+    libunwind-dev \
+    libboost-all-dev \
+    libssl-dev \
+    libgrpc-dev \
+    libgrpc++-dev \
+    libprotobuf-dev \
+    protobuf-compiler \
+    protobuf-compiler-grpc \
     pybind11-dev \
-    libhiredis-dev libcurl4-openssl-dev \
-    libczmq4 libczmq-dev \
+    libhiredis-dev \
+    libcurl4-openssl-dev \
+    libczmq4 \
+    libczmq-dev \
     libfabric-dev \
-    patchelf \
-    nvidia-dkms-550 \
-    devscripts debhelper fakeroot dkms check libsubunit0 libsubunit-dev \
-    && ln -sf /usr/bin/python3.12 /usr/bin/python \
-    && rm -rf /var/lib/apt/lists/* \
-    && apt-get clean
+    # Package building tools
+    devscripts \
+    debhelper \
+    fakeroot \
+    dkms \
+    check \
+    libsubunit0 \
+    libsubunit-dev \
+    && ln -sf /usr/bin/python3.12 /usr/bin/python \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean

-# Replace pip global cache if it is specified
+# Replace pip global cache if specified
 RUN if [ -n "${PIP_DEFAULT_INDEX}" ]; then \
     python3 -m pip config set global.index-url ${PIP_DEFAULT_INDEX}; \
     fi

 # GDRCopy installation
 RUN mkdir -p /tmp/gdrcopy && cd /tmp \
-    && wget -q https://${GITHUB_ARTIFACTORY}/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
-    && tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
-    && cd gdrcopy-${GDRCOPY_VERSION}/packages \
-    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
-    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
-    && cd / && rm -rf /tmp/gdrcopy
+    && curl --retry 3 --retry-delay 2 -fsSL -o v${GDRCOPY_VERSION}.tar.gz \
+       https://${GITHUB_ARTIFACTORY}/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
+    && tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
+    && cd gdrcopy-${GDRCOPY_VERSION}/packages \
+    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
+    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
+    && cd / && rm -rf /tmp/gdrcopy

 # Fix DeepEP IBGDA symlink
 RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so

+# Set up locale
+RUN locale-gen en_US.UTF-8
+ENV LANG=en_US.UTF-8 \
+    LANGUAGE=en_US:en \
+    LC_ALL=en_US.UTF-8
+
+########################################################
+########## Framework Development Image ################
+########################################################
+
+# Copy local source if building from local
 FROM scratch AS local_src
 COPY . /src

-FROM base AS build-image
-# Install SGLang
-# Until torch 2.9 and cu13 are stable we manually update torch if you are on CUDA 13
-WORKDIR /sgl-workspace
+FROM base AS framework
+
 ARG BRANCH_TYPE
+ARG BUILD_TYPE
+ARG CUDA_VERSION
+ARG BUILD_AND_DOWNLOAD_PARALLEL
+ARG SGL_KERNEL_VERSION
+ARG SGL_VERSION
+ARG USE_LATEST_SGLANG
+ARG INSTALL_FLASHINFER_JIT_CACHE
+ARG FLASHINFER_VERSION
+ARG GRACE_BLACKWELL
+ARG GRACE_BLACKWELL_DEEPEP_BRANCH
+ARG DEEPEP_COMMIT
+ARG TRITON_LANG_COMMIT
+ARG GITHUB_ARTIFACTORY
+
+WORKDIR /sgl-workspace
+
+# Install SGLang
 COPY --from=local_src /src /tmp/local_src
 RUN if [ "$BRANCH_TYPE" = "local" ]; then \
         cp -r /tmp/local_src /sgl-workspace/sglang; \
@@ -106,44 +184,43 @@ RUN if [ "$BRANCH_TYPE" = "local" ]; then \
     else \
         git clone --depth=1 --branch v${SGL_VERSION} https://github.com/sgl-project/sglang.git /sgl-workspace/sglang; \
     fi \
-    && rm -rf /tmp/local_src
-RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install --upgrade pip setuptools wheel html5lib six \
-    && cd sglang \
-    && case "$CUDA_VERSION" in \
-        12.6.1) CUINDEX=126 ;; \
-        12.8.1) CUINDEX=128 ;; \
-        12.9.1) CUINDEX=129 ;; \
-        13.0.1) CUINDEX=130 ;; \
-        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
-    esac \
-    && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
-    python3 -m pip install https://${GITHUB_ARTIFACTORY}/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
-    ; \
-    elif [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
-    python3 -m pip install sgl-kernel==${SGL_KERNEL_VERSION} \
-    ; \
-    elif [ "$CUDA_VERSION" = "13.0.1" ]; then \
-    python3 -m pip install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu130-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
-    ; \
-    else \
-    echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
-    ; \
-    fi \
-    && python3 -m pip install -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
-    && if [ "$INSTALL_FLASHINFER_JIT_CACHE" = "1" ]; then \
-    python3 -m pip install flashinfer-jit-cache==${FLASHINFER_VERSION} --index-url https://flashinfer.ai/whl/cu${CUINDEX} ; \
-    fi \
-    && FLASHINFER_CUBIN_DOWNLOAD_THREADS=${BUILD_AND_DOWNLOAD_PARALLEL} FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin
+    && rm -rf /tmp/local_src
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade pip setuptools wheel html5lib six \
+    && cd sglang \
+    && case "$CUDA_VERSION" in \
+        12.6.1) CUINDEX=126 ;; \
+        12.8.1) CUINDEX=128 ;; \
+        12.9.1) CUINDEX=129 ;; \
+        13.0.1) CUINDEX=130 ;; \
+        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
+    esac \
+    && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
+    python3 -m pip install https://${GITHUB_ARTIFACTORY}/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
+    ; \
+    elif [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
+    python3 -m pip install sgl-kernel==${SGL_KERNEL_VERSION} \
+    ; \
+    elif [ "$CUDA_VERSION" = "13.0.1" ]; then \
+    python3 -m pip install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu130-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
+    ; \
+    else \
+    echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
+    ; \
+    fi \
+    && python3 -m pip install -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
+    && if [ "$INSTALL_FLASHINFER_JIT_CACHE" = "1" ]; then \
+    python3 -m pip install flashinfer-jit-cache==${FLASHINFER_VERSION} --index-url https://flashinfer.ai/whl/cu${CUINDEX} ; \
+    fi \
+    && FLASHINFER_CUBIN_DOWNLOAD_THREADS=${BUILD_AND_DOWNLOAD_PARALLEL} FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin

+# DeepEP
 # We use Tom's DeepEP fork for GB200 for now; the 1fd57b0276311d035d16176bb0076426166e52f3 commit is https://github.com/fzyzcjy/DeepEP/tree/gb200_blog_part_2
+# TODO: move from Tom's branch to DeepEP hybrid-ep branch
+# We use the nvshmem version that ships with torch 2.9.1
+# CU12 uses 3.3.20 and CU13 uses 3.3.24
 RUN set -eux; \
-    if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
-    pip install nvidia-nvshmem-cu12==3.3.20 ; \
-    elif [ "${CUDA_VERSION%%.*}" = "13" ]; then \
-    pip install nvidia-nvshmem-cu13==${NVSHMEM_VERSION} ; \
-    else \
-    echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ; \
-    fi && \
     if [ "$GRACE_BLACKWELL" = "1" ]; then \
     git clone https://github.com/fzyzcjy/DeepEP.git && \
     cd DeepEP && \
@@ -157,33 +234,40 @@ RUN set -eux; \
     sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
     cd .. ; \
     else \
-    wget -q https://${GITHUB_ARTIFACTORY}/deepseek-ai/DeepEP/archive/${DEEPEP_COMMIT}.zip && \
-    unzip ${DEEPEP_COMMIT}.zip && rm ${DEEPEP_COMMIT}.zip && mv DeepEP-${DEEPEP_COMMIT} DeepEP && cd DeepEP && \
-    sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
-    cd .. ; \
+    curl --retry 3 --retry-delay 2 -fsSL -o ${DEEPEP_COMMIT}.zip \
+        https://${GITHUB_ARTIFACTORY}/deepseek-ai/DeepEP/archive/${DEEPEP_COMMIT}.zip && \
+    unzip -q ${DEEPEP_COMMIT}.zip && rm ${DEEPEP_COMMIT}.zip && mv DeepEP-${DEEPEP_COMMIT} DeepEP && cd DeepEP && \
+    sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
+    cd .. ; \
     fi

 # Install DeepEP
-# CTK13 requires the cccl include
-RUN --mount=type=cache,target=/root/.cache/pip cd /sgl-workspace/DeepEP && \
+RUN --mount=type=cache,target=/root/.cache/pip \
+    cd /sgl-workspace/DeepEP && \
     case "$CUDA_VERSION" in \
-        12.6.1) \
-            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0' \
-            ;; \
-        12.8.1|12.9.1|13.0.1) \
-            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0;10.3' \
-            ;; \
-        *) \
-            echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
-            ;; \
+        12.6.1) \
+            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0' \
+            ;; \
+        12.8.1) \
+            # FIXED: 12.8.1 does NOT support Blackwell 10.3 \
+            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0' \
+            ;; \
+        12.9.1|13.0.1) \
+            # 12.9.1+ properly supports Blackwell 10.3 \
+            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0;10.3' \
+            ;; \
+        *) \
+            echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
+            ;; \
     esac && \
     if [ "${CUDA_VERSION%%.*}" = "13" ]; then \
-        sed -i "/^ include_dirs = \['csrc\/'\]/a\ include_dirs.append('${CUDA_HOME}/include/cccl')" setup.py; \
+        sed -i "/^ include_dirs = \['csrc\/'\]/a\    include_dirs.append('${CUDA_HOME}/include/cccl')" setup.py; \
     fi && \
     TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" MAX_JOBS=${BUILD_AND_DOWNLOAD_PARALLEL} pip install --no-build-isolation .

-# Python tools
-RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install \
+# Install essential Python packages
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install \
     datamodel_code_generator \
     mooncake-transfer-engine==0.3.7.post2 \
     pre-commit \
@@ -197,65 +281,77 @@ RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install \
     nixl \
     py-spy

-# Some patching packages
-# TODO: Remove this when torch version covers these packages
-# Move cutlass-dsl to pyproject.toml after drivers on CI runners are updated
+# Build and install sgl-router (install Rust, build, then remove to save space)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    curl --proto '=https' --tlsv1.2 --retry 3 --retry-delay 2 -sSf https://sh.rustup.rs | sh -s -- -y \
+    && export PATH="/root/.cargo/bin:${PATH}" \
+    && rustc --version && cargo --version \
+    && python3 -m pip install maturin \
+    && cd /sgl-workspace/sglang/sgl-router/bindings/python \
+    && ulimit -n 65536 && maturin build --release --features vendored-openssl --out dist \
+    && python3 -m pip install --force-reinstall dist/*.whl \
+    && cd /sgl-workspace/sglang/sgl-router \
+    && cargo build --release --bin sglang-router --features vendored-openssl \
+    && cp target/release/sglang-router /usr/local/bin/sglang-router \
+    && rm -rf /root/.cargo /root/.rustup target dist ~/.cargo \
+    && sed -i '/\.cargo\/env/d' /root/.profile /root/.bashrc 2>/dev/null || true
+
+# Patching packages for CUDA 12/13 compatibility
+# TODO: Remove when torch version covers these packages
+# TODO: Move cutlass-dsl to pyproject.toml after drivers on CI runners are updated
 RUN --mount=type=cache,target=/root/.cache/pip if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
     python3 -m pip install nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps ; \
-    python3 -m pip install nvidia-cudnn-cu12==9.16.0.29 --force-reinstall --no-deps; \
-    python3 -m pip install nvidia-nvshmem-cu12==3.3.20 --force-reinstall --no-deps; \
-    python3 -m pip install nvidia-cutlass-dsl==4.3.0 --force-reinstall --no-deps; \
+    python3 -m pip install nvidia-cudnn-cu12==9.16.0.29 --force-reinstall --no-deps ; \
+    python3 -m pip install nvidia-cutlass-dsl==4.3.0 --force-reinstall --no-deps ; \
     elif [ "${CUDA_VERSION%%.*}" = "13" ]; then \
     python3 -m pip install nvidia-nccl-cu13==2.28.3 --force-reinstall --no-deps ; \
     python3 -m pip install nvidia-cublas==13.1.0.3 --force-reinstall --no-deps ; \
     python3 -m pip install nixl-cu13 ; \
-    python3 -m pip install nvidia-nvshmem-cu13==${NVSHMEM_VERSION} --force-reinstall --no-deps; \
-    python3 -m pip install nvidia-cutlass-dsl==4.3.1 --force-reinstall --no-deps; \
+    python3 -m pip install nvidia-cutlass-dsl==4.3.1 --force-reinstall --no-deps ; \
     fi

-# Install development tools and utilities
-RUN --mount=type=cache,target=/var/cache/apt apt-get update && apt-get install -y \
+# Install development tools
+RUN --mount=type=cache,target=/var/cache/apt,id=framework-apt \
+    apt-get update && apt-get install -y --no-install-recommends \
     gdb \
     ninja-build \
     vim \
     tmux \
     htop \
-    wget \
-    curl \
-    locales \
-    lsof \
-    git \
-    git-lfs \
     zsh \
     tree \
     silversearcher-ag \
     cloc \
-    unzip \
     pkg-config \
-    libssl-dev \
     bear \
-    ccache \
     less \
+    rdma-core \
+    openssh-server \
     gnuplot \
-    && apt install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 \
+    infiniband-diags \
+    perftest \
+    ibverbs-providers \
+    libibumad3 \
+    libibverbs1 \
+    libnl-3-200 \
+    libnl-route-3-200 \
+    librdmacm1 \
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean

-RUN --mount=type=cache,target=/var/cache/apt apt update -y \
+# Install NVIDIA development tools
+RUN --mount=type=cache,target=/var/cache/apt,id=framework-apt \
+    apt update -y \
     && apt install -y --no-install-recommends gnupg \
     && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
     && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "x86_64"; fi)/7fa2af80.pub \
     && apt update -y \
-    && apt install nsight-systems-cli -y
-
-# Set up locale
-RUN locale-gen en_US.UTF-8
-ENV LANG=en_US.UTF-8
-ENV LANGUAGE=en_US:en
-ENV LC_ALL=en_US.UTF-8
+    && apt install -y --no-install-recommends nsight-systems-cli \
+    && rm -rf /var/lib/apt/lists/*

-# Install minimal Python packages
-RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install --break-system-packages \
+# Install minimal Python dev packages
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --break-system-packages \
     pytest \
     black \
     isort \
@@ -268,66 +364,183 @@ RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install --break-sy
     tabulate \
     termplotlib

-# Install diff-so-fancy
-RUN curl -LSso /usr/local/bin/diff-so-fancy https://${GITHUB_ARTIFACTORY}/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
+# diff-so-fancy
+RUN curl --retry 3 --retry-delay 2 -LSso /usr/local/bin/diff-so-fancy \
+    https://${GITHUB_ARTIFACTORY}/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
     && chmod +x /usr/local/bin/diff-so-fancy

-# Install clang-format
-RUN curl -LSso /usr/local/bin/clang-format https://${GITHUB_ARTIFACTORY}/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
+# clang-format
+RUN curl --retry 3 --retry-delay 2 -LSso /usr/local/bin/clang-format \
+    https://${GITHUB_ARTIFACTORY}/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
     && chmod +x /usr/local/bin/clang-format

-# Install clangd
-RUN curl -L https://${GITHUB_ARTIFACTORY}/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip -o clangd.zip \
-    && unzip clangd.zip \
+# clangd
+RUN curl --retry 3 --retry-delay 2 -fsSL -o clangd.zip \
+    https://${GITHUB_ARTIFACTORY}/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip \
+    && unzip -q clangd.zip \
     && cp -r clangd_18.1.3/bin/* /usr/local/bin/ \
     && cp -r clangd_18.1.3/lib/* /usr/local/lib/ \
     && rm -rf clangd_18.1.3 clangd.zip

-# Install CMake
+# CMake
 RUN CMAKE_VERSION=3.31.1 \
     && ARCH=$(uname -m) \
     && CMAKE_INSTALLER="cmake-${CMAKE_VERSION}-linux-${ARCH}" \
-    && wget -q "https://${GITHUB_ARTIFACTORY}/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_INSTALLER}.tar.gz" \
+    && curl --retry 3 --retry-delay 2 -fsSL -o "${CMAKE_INSTALLER}.tar.gz" \
+       "https://${GITHUB_ARTIFACTORY}/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_INSTALLER}.tar.gz" \
     && tar -xzf "${CMAKE_INSTALLER}.tar.gz" \
     && cp -r "${CMAKE_INSTALLER}/bin/"* /usr/local/bin/ \
     && cp -r "${CMAKE_INSTALLER}/share/"* /usr/local/share/ \
     && rm -rf "${CMAKE_INSTALLER}" "${CMAKE_INSTALLER}.tar.gz"

-# Build and install sgl-router (Rust toolchain removed after build to save space)
-RUN --mount=type=cache,target=/root/.cache/pip curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
-    && export PATH="/root/.cargo/bin:${PATH}" \
-    && rustc --version && cargo --version \
-    && python3 -m pip install maturin \
-    && cd /sgl-workspace/sglang/sgl-router/bindings/python \
-    && ulimit -n 65536 && maturin build --release --features vendored-openssl --out dist \
-    && python3 -m pip install --force-reinstall dist/*.whl \
-    && rm -rf /root/.cargo /root/.rustup target dist ~/.cargo \
-    && sed -i '/\.cargo\/env/d' /root/.profile /root/.bashrc /root/.zshenv 2>/dev/null || true
-
+# Install just
+RUN curl --proto '=https' --tlsv1.2 --retry 3 --retry-delay 2 -sSf https://just.systems/install.sh | \
+    sed "s|https://github.com|https://${GITHUB_ARTIFACTORY}|g" | \
+    bash -s -- --tag 1.42.4 --to /usr/local/bin

 # Add yank script
 COPY --chown=root:root --chmod=755 docker/configs/yank /usr/local/bin/yank

 # Install oh-my-zsh and plugins
-RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended \
-    && git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
-    && git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting
+RUN sh -c "$(curl --retry 3 --retry-delay 2 -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended \
+    && git clone --depth 1 https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
+    && git clone --depth 1 https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting

-# Configure Vim and tmux
+# Configure development environment
 COPY docker/configs/.vimrc /root/.vimrc
 COPY docker/configs/.tmux.conf /root/.tmux.conf
-
-# Configure Git
 COPY docker/configs/.gitconfig /tmp/.gitconfig
 RUN cat /tmp/.gitconfig >> /root/.gitconfig && rm /tmp/.gitconfig
-
-# Configure zsh
 COPY docker/configs/.zshrc /root/.zshrc

-RUN set -euxo ; \
-    curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | \
-    sed "s|https://github.com|https://${GITHUB_ARTIFACTORY}|g" | \
-    bash -s -- --tag 1.42.4 --to /usr/local/bin
+# Fix Triton to use system ptxas for Blackwell (sm_103a) support (CUDA 13+ only)
+RUN if [ "${CUDA_VERSION%%.*}" = "13" ] && [ -d /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin ]; then \
+    rm -f /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas && \
+    ln -s /usr/local/cuda/bin/ptxas /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas; \
+    fi

 # Set workspace directory
 WORKDIR /sgl-workspace/sglang
+
+########################################################
+########## Runtime Image ##############################
+########################################################
+#
+# PURPOSE: Production runtime environment with JIT support
+#
+# This stage creates a production-ready image containing:
+# - Pre-compiled SGLang and DeepEP components
+# - Full CUDA toolchain for JIT compilation (DeepGEMM, Triton, FlashInfer)
+# - Optimized for inference workloads and deployment
+# - Smaller than framework (no dev tools like vim, tmux, nsight, etc.)
+#
+# Use this stage when you need:
+# - Production deployment of SGLang
+# - JIT compilation support for FP8/microscaling kernels
+# - Ready-to-run inference server environment
+#
+# Note: Uses devel base for complete NVCC toolchain required by DeepGEMM JIT
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu24.04 AS runtime
+
+ARG CUDA_VERSION
+ARG TARGETARCH
+ARG GDRCOPY_VERSION=2.5.1
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    CUDA_HOME=/usr/local/cuda \
+    GDRCOPY_HOME=/usr/src/gdrdrv-${GDRCOPY_VERSION}/
+
+# Add GKE default lib and bin locations + CUDA compiler paths for FlashInfer JIT
+ENV PATH="${PATH}:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/cuda/nvvm/bin" \
+    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
+
+# Install runtime dependencies (devel base provides gcc/g++/build tools)
+RUN --mount=type=cache,target=/var/cache/apt,id=runtime-apt \
+    apt-get update && apt-get install -y --no-install-recommends \
+    # Python runtime
+    software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa -y \
+    && apt-get update && apt-get install -y --no-install-recommends --allow-change-held-packages \
+    python3.12-full \
+    python3.12-dev \
+    wget \
+    # Core system utilities
+    tzdata \
+    ca-certificates \
+    netcat-openbsd \
+    curl \
+    git \
+    # Runtime libraries
+    libopenmpi3 \
+    libnuma1 \
+    libibverbs1 \
+    libibumad3 \
+    librdmacm1 \
+    libnl-3-200 \
+    libnl-route-3-200 \
+    ibverbs-providers \
+    libgoogle-glog0v6t64 \
+    libunwind8 \
+    libboost-system1.83.0 \
+    libboost-thread1.83.0 \
+    libboost-filesystem1.83.0 \
+    libgrpc++1.51t64 \
+    libprotobuf32t64 \
+    libhiredis1.1.0 \
+    libcurl4 \
+    libczmq4 \
+    libfabric1 \
+    libssl3 \
+    # RDMA runtime
+    rdma-core \
+    infiniband-diags \
+    perftest \
+    # Build tools for JIT compilation
+    ninja-build \
+    # NCCL packages needed for pynccl_allocator JIT compilation (-lnccl)
+    libnccl2 \
+    libnccl-dev \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 \
+    && update-alternatives --set python3 /usr/bin/python3.12 \
+    && ln -sf /usr/bin/python3.12 /usr/bin/python \
+    && wget -q https://bootstrap.pypa.io/get-pip.py \
+    && python3 get-pip.py --break-system-packages \
+    && rm get-pip.py \
+    # Allow pip to install packages globally (PEP 668 workaround for Ubuntu 24.04)
+    && python3 -m pip config set global.break-system-packages true \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+# Set up locale
+RUN apt-get update && apt-get install -y --no-install-recommends locales \
+    && locale-gen en_US.UTF-8 \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV LANG=en_US.UTF-8 \
+    LANGUAGE=en_US:en \
+    LC_ALL=en_US.UTF-8
+
+# Copy Python site-packages from framework (contains all built packages)
+COPY --from=framework /usr/local/lib/python3.12/dist-packages /usr/local/lib/python3.12/dist-packages
+
+# Copy SGLang workspace
+COPY --from=framework /sgl-workspace /sgl-workspace
+
+# Fix Triton to use system ptxas for Blackwell (sm_103a) support (CUDA 13+ only)
+RUN if [ "${CUDA_VERSION%%.*}" = "13" ] && [ -d /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin ]; then \
+    rm -f /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas && \
+    ln -s /usr/local/cuda/bin/ptxas /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas; \
+    fi
+
+# Copy GDRCopy runtime libraries (but not the build artifacts)
+COPY --from=framework /usr/lib/libgdrapi.so* /usr/lib/
+COPY --from=framework /usr/bin/gdrcopy_* /usr/bin/
+COPY --from=framework /usr/src/gdrdrv-2.5.1 /usr/src/gdrdrv-2.5.1
+
+# Fix DeepEP IBGDA symlink in runtime
+RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so
+
+WORKDIR /sgl-workspace/sglang
+
+# Default command
+CMD ["/bin/bash"]
diff --git a/docs/get_started/install.md b/docs/get_started/install.md
index a4911c4159a..913c3bcd77d 100644
--- a/docs/get_started/install.md
+++ b/docs/get_started/install.md
@@ -53,6 +53,19 @@ docker run --gpus all \
     python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```

+For production deployments, use the `runtime` variant which is significantly smaller (~40% reduction) by excluding build tools and development dependencies:
+
+```bash
+docker run --gpus all \
+    --shm-size 32g \
+    -p 30000:30000 \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HF_TOKEN=<secret>" \
+    --ipc=host \
+    lmsysorg/sglang:latest-runtime \
+    python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+```
+
 You can also find the nightly docker images [here](https://hub.docker.com/r/lmsysorg/sglang/tags?name=nightly).

 ## Method 4: Using Kubernetes
@@ -172,10 +185,10 @@ echo "Build and push completed successfully!"

 3. Deploy a model for serving on AWS Sagemaker, refer to [deploy_and_serve_endpoint.py](https://github.com/sgl-project/sglang/blob/main/examples/sagemaker/deploy_and_serve_endpoint.py). For more information, check out [sagemaker-python-sdk](https://github.com/aws/sagemaker-python-sdk).

-   1. By default, the model server on SageMaker will run with the following command: `python3 -m sglang.launch_server --model-path opt/ml/model --host 0.0.0.0 --port 8080`. This is optimal for hosting your own model with SageMaker.
-   2. To modify your model serving parameters, the [serve](https://github.com/sgl-project/sglang/blob/main/docker/serve) script allows for all available options within `python3 -m sglang.launch_server --help` cli by specifying environment variables with prefix `SM_SGLANG_`.
-   3. The serve script will automatically convert all environment variables with prefix `SM_SGLANG_` from `SM_SGLANG_INPUT_ARGUMENT` into `--input-argument` to be parsed into `python3 -m sglang.launch_server` cli.
-   4. For example, to run [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) with reasoning parser, simply add additional environment variables `SM_SGLANG_MODEL_PATH=Qwen/Qwen3-0.6B` and `SM_SGLANG_REASONING_PARSER=qwen3`.
+    1. By default, the model server on SageMaker will run with the following command: `python3 -m sglang.launch_server --model-path opt/ml/model --host 0.0.0.0 --port 8080`. This is optimal for hosting your own model with SageMaker.
+    2. To modify your model serving parameters, the [serve](https://github.com/sgl-project/sglang/blob/main/docker/serve) script allows for all available options within `python3 -m sglang.launch_server --help` cli by specifying environment variables with prefix `SM_SGLANG_`.
+    3. The serve script will automatically convert all environment variables with prefix `SM_SGLANG_` from `SM_SGLANG_INPUT_ARGUMENT` into `--input-argument` to be parsed into `python3 -m sglang.launch_server` cli.
+    4. For example, to run [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) with reasoning parser, simply add additional environment variables `SM_SGLANG_MODEL_PATH=Qwen/Qwen3-0.6B` and `SM_SGLANG_REASONING_PARSER=qwen3`.
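
A minimal sketch of how the new multi-stage targets can be exercised locally, assuming Docker with the buildx plugin is available; it reuses the `runtime` target and the `CUDA_VERSION`/`BUILD_TYPE`/`GRACE_BLACKWELL` build args introduced in docker/Dockerfile above, and the local tag name `sglang-runtime:local` is only illustrative.

    # Build only the production runtime stage from the repository root
    docker buildx build \
      --target runtime \
      -f docker/Dockerfile \
      --build-arg CUDA_VERSION=12.9.1 \
      --build-arg BUILD_TYPE=all \
      --build-arg GRACE_BLACKWELL=0 \
      -t sglang-runtime:local .

    # Inspect a multi-arch runtime manifest published by the create-manifests job
    docker buildx imagetools inspect lmsysorg/sglang:latest-runtime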