diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
index 596033854e7..a95e5ddf576 100644
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@@ -1,4 +1,15 @@
 name: Release Docker Images
+#
+# This workflow builds and publishes both framework and runtime Docker images:
+#
+# Framework images (full development environment):
+#   - lmsysorg/sglang:v{version}, lmsysorg/sglang:latest
+#   - lmsysorg/sglang:v{version}-cu129-{amd64,arm64}
+#
+# Runtime images (production-optimized, ~50% smaller):
+#   - lmsysorg/sglang:v{version}-runtime, lmsysorg/sglang:latest-runtime
+#   - lmsysorg/sglang:v{version}-cu129-{amd64,arm64}-runtime
+#
 on:
   push:
     branches:
@@ -45,12 +56,13 @@ jobs:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}

-      - name: Build and Push AMD64
+      - name: Build and Push AMD64 Framework
         run: |
           version=$(cat python/sglang/version.py | cut -d'"' -f2)
           tag=v${version}-cu129-amd64

           docker buildx build \
+            --target framework \
             --platform linux/amd64 \
             --push \
             -f docker/Dockerfile \
@@ -61,6 +73,40 @@ jobs:
             --no-cache \
             .

+      - name: Build and Push AMD64 Runtime
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-cu129-amd64-runtime
+
+          docker buildx build \
+            --target runtime \
+            --platform linux/amd64 \
+            --push \
+            -f docker/Dockerfile \
+            --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} \
+            --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \
+            --build-arg GRACE_BLACKWELL=${{ matrix.variant.grace_blackwell }} \
+            -t lmsysorg/sglang:${tag} \
+            --no-cache \
+            .
+
+      - name: Build and Push AMD64 Runtime (CUDA 13)
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-cu130-amd64-runtime
+
+          docker buildx build \
+            --target runtime \
+            --platform linux/amd64 \
+            --push \
+            -f docker/Dockerfile \
+            --build-arg CUDA_VERSION=13.0.1 \
+            --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \
+            --build-arg GRACE_BLACKWELL=0 \
+            -t lmsysorg/sglang:${tag} \
+            --no-cache \
+            .
+
   publish-arm64:
     if: github.repository == 'sgl-project/sglang'
     environment: "prod"
@@ -87,12 +133,30 @@ jobs:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}

-      - name: Build and Push ARM64
+      - name: Build and Push ARM64 Framework
         run: |
           version=$(cat python/sglang/version.py | cut -d'"' -f2)
           tag=v${version}-cu129-arm64

           docker buildx build \
+            --target framework \
+            --platform linux/arm64 \
+            --push \
+            -f docker/Dockerfile \
+            --build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} \
+            --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \
+            --build-arg GRACE_BLACKWELL=${{ matrix.variant.grace_blackwell }} \
+            -t lmsysorg/sglang:${tag} \
+            --no-cache \
+            .
+
+      - name: Build and Push ARM64 Runtime
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-cu129-arm64-runtime
+
+          docker buildx build \
+            --target runtime \
             --platform linux/arm64 \
             --push \
             -f docker/Dockerfile \
@@ -103,6 +167,23 @@ jobs:
             --no-cache \
             .

+      - name: Build and Push ARM64 Runtime (CUDA 13)
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-cu130-arm64-runtime
+
+          docker buildx build \
+            --target runtime \
+            --platform linux/arm64 \
+            --push \
+            -f docker/Dockerfile \
+            --build-arg CUDA_VERSION=13.0.1 \
+            --build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \
+            --build-arg GRACE_BLACKWELL=1 \
+            -t lmsysorg/sglang:${tag} \
+            --no-cache \
+            .
+
   create-manifests:
     runs-on: ubuntu-22.04
     needs: [publish-x86, publish-arm64]
@@ -125,14 +206,38 @@ jobs:
         run: |
           version=$(cat python/sglang/version.py | cut -d'"' -f2)

-          # Create versioned manifest
+          # Create versioned framework manifest (default)
           docker buildx imagetools create \
             -t lmsysorg/sglang:v${version} \
             lmsysorg/sglang:v${version}-cu129-amd64 \
             lmsysorg/sglang:v${version}-cu129-arm64

-          # Create latest manifest
+          # Create latest framework manifest (default)
           docker buildx imagetools create \
             -t lmsysorg/sglang:latest \
             lmsysorg/sglang:v${version}-cu129-amd64 \
             lmsysorg/sglang:v${version}-cu129-arm64
+
+          # Create versioned runtime manifest
+          docker buildx imagetools create \
+            -t lmsysorg/sglang:v${version}-runtime \
+            lmsysorg/sglang:v${version}-cu129-amd64-runtime \
+            lmsysorg/sglang:v${version}-cu129-arm64-runtime
+
+          # Create latest runtime manifest
+          docker buildx imagetools create \
+            -t lmsysorg/sglang:latest-runtime \
+            lmsysorg/sglang:v${version}-cu129-amd64-runtime \
+            lmsysorg/sglang:v${version}-cu129-arm64-runtime
+
+          # Create versioned CUDA 13 runtime manifest
+          docker buildx imagetools create \
+            -t lmsysorg/sglang:v${version}-cu130-runtime \
+            lmsysorg/sglang:v${version}-cu130-amd64-runtime \
+            lmsysorg/sglang:v${version}-cu130-arm64-runtime
+
+          # Create latest CUDA 13 runtime manifest
+          docker buildx imagetools create \
+            -t lmsysorg/sglang:latest-cu130-runtime \
+            lmsysorg/sglang:v${version}-cu130-amd64-runtime \
+            lmsysorg/sglang:v${version}-cu130-arm64-runtime
diff --git a/docker/Dockerfile b/docker/Dockerfile
index c3d7a7d21cb..97f7feb47b1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,10 +1,11 @@
 ARG CUDA_VERSION=12.9.1
-FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 AS base
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu24.04 AS base

 ARG TARGETARCH
 ARG BUILD_TYPE=all
 ARG BRANCH_TYPE=remote
 ARG GRACE_BLACKWELL=0
+ARG HOPPER_SBO=0
 ARG GRACE_BLACKWELL_DEEPEP_BRANCH=gb200_blog_part_2
 ARG HOPPER_SBO_DEEPEP_COMMIT=9f2fc4b3182a51044ae7ecb6610f7c9c3258c4d6

@@ -19,85 +20,162 @@ ARG UBUNTU_MIRROR
 ARG GITHUB_ARTIFACTORY=github.com
 ARG INSTALL_FLASHINFER_JIT_CACHE=0
 ARG FLASHINFER_VERSION=0.5.3
-ARG NVSHMEM_VERSION=3.4.5

 ENV DEBIAN_FRONTEND=noninteractive \
     CUDA_HOME=/usr/local/cuda \
     GDRCOPY_HOME=/usr/src/gdrdrv-${GDRCOPY_VERSION}/ \
     FLASHINFER_VERSION=${FLASHINFER_VERSION}

-# Add GKE default lib and bin locations.
+
+# Add GKE default lib and bin locations
 ENV PATH="${PATH}:/usr/local/nvidia/bin" \
     LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"

-# Replace Ubuntu sources if it is specified
+# Replace Ubuntu sources if specified
 RUN if [ -n "$UBUNTU_MIRROR" ]; then \
     sed -i "s|http://.*archive.ubuntu.com|$UBUNTU_MIRROR|g" /etc/apt/sources.list && \
     sed -i "s|http://.*security.ubuntu.com|$UBUNTU_MIRROR|g" /etc/apt/sources.list; \
     fi

-RUN --mount=type=cache,target=/var/cache/apt apt update && apt install wget -y && apt install software-properties-common -y \
-    && add-apt-repository ppa:deadsnakes/ppa -y \
-    && apt install python3.12-full python3.12-dev python3.10-venv -y \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 \
-    && update-alternatives --set python3 /usr/bin/python3.12 \
-    && wget https://bootstrap.pypa.io/get-pip.py \
-    && python3 get-pip.py \
-    # Fix for `apt-add-repository`
-    && cd /usr/lib/python3/dist-packages/ \
-    && ln -s apt_pkg.cpython-310-*-linux-gnu.so apt_pkg.so
-
-# Set timezone and install all packages
-RUN --mount=type=cache,target=/var/cache/apt echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update && apt-get install -y --no-install-recommends \
+# Python setup (combined with apt update to reduce layers)
+RUN --mount=type=cache,target=/var/cache/apt,id=base-apt \
+    apt update && apt install -y --no-install-recommends wget software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa -y \
+    && apt install -y --no-install-recommends python3.12-full python3.12-dev python3.10-venv \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 \
+    && update-alternatives --set python3 /usr/bin/python3.12 \
+    && wget -q https://bootstrap.pypa.io/get-pip.py \
+    && python3 get-pip.py --break-system-packages \
+    && rm get-pip.py \
+    # Allow pip to install packages globally (PEP 668 workaround for Ubuntu 24.04)
+    && python3 -m pip config set global.break-system-packages true \
+    # Fix for apt-add-repository
+    && cd /usr/lib/python3/dist-packages/ \
+    && ln -s apt_pkg.cpython-310-*-linux-gnu.so apt_pkg.so
+
+# Install system dependencies (organized by category for better caching)
+RUN --mount=type=cache,target=/var/cache/apt,id=base-apt \
+    echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update && apt-get install -y --no-install-recommends \
+    # Core system utilities
     tzdata \
-    software-properties-common netcat-openbsd kmod unzip openssh-server \
-    curl wget lsof zsh ccache tmux htop git-lfs tree \
-    build-essential cmake perl \
-    libopenmpi-dev libnuma1 libnuma-dev numactl \
-    libibverbs-dev libibverbs1 libibumad3 \
-    librdmacm1 libnl-3-200 libnl-route-3-200 libnl-route-3-dev libnl-3-dev \
-    ibverbs-providers infiniband-diags perftest \
-    libgoogle-glog-dev libgtest-dev libjsoncpp-dev libunwind-dev \
-    libboost-all-dev libssl-dev \
-    libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler protobuf-compiler-grpc \
+    ca-certificates \
+    software-properties-common \
+    netcat-openbsd \
+    kmod \
+    unzip \
+    openssh-server \
+    curl \
+    wget \
+    lsof \
+    locales \
+    # Build essentials (needed for framework stage)
+    build-essential \
+    cmake \
+    perl \
+    patchelf \
+    ccache \
+    git-lfs \
+    # MPI and NUMA
+    libopenmpi-dev \
+    libnuma1 \
+    libnuma-dev \
+    numactl \
+    # InfiniBand/RDMA
+    libibverbs-dev \
+    libibverbs1 \
+    libibumad3 \
+    librdmacm1 \
+    libnl-3-200 \
+    libnl-route-3-200 \
+    libnl-route-3-dev \
+    libnl-3-dev \
+    ibverbs-providers \
+    infiniband-diags \
+    perftest \
+    # Development libraries
+    libgoogle-glog-dev \
+    libgtest-dev \
+    libjsoncpp-dev \
+    libunwind-dev \
+    libboost-all-dev \
+    libssl-dev \
+    libgrpc-dev \
+    libgrpc++-dev \
+    libprotobuf-dev \
+    protobuf-compiler \
+    protobuf-compiler-grpc \
     pybind11-dev \
-    libhiredis-dev libcurl4-openssl-dev \
-    libczmq4 libczmq-dev \
+    libhiredis-dev \
+    libcurl4-openssl-dev \
+    libczmq4 \
+    libczmq-dev \
     libfabric-dev \
-    patchelf \
-    nvidia-dkms-550 \
-    devscripts debhelper fakeroot dkms check libsubunit0 libsubunit-dev \
-    && ln -sf /usr/bin/python3.12 /usr/bin/python \
-    && rm -rf /var/lib/apt/lists/* \
-    && apt-get clean
+    # Package building tools
+    devscripts \
+    debhelper \
+    fakeroot \
+    dkms \
+    check \
+    libsubunit0 \
+    libsubunit-dev \
+    && ln -sf /usr/bin/python3.12 /usr/bin/python \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean

-# Replace pip global cache if it is specified
+# Replace pip global cache if specified
 RUN if [ -n "${PIP_DEFAULT_INDEX}" ]; then \
     python3 -m pip config set global.index-url ${PIP_DEFAULT_INDEX}; \
     fi

 # GDRCopy installation
 RUN mkdir -p /tmp/gdrcopy && cd /tmp \
-    && wget -q https://${GITHUB_ARTIFACTORY}/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
-    && tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
-    && cd gdrcopy-${GDRCOPY_VERSION}/packages \
-    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
-    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
-    && cd / && rm -rf /tmp/gdrcopy
+    && curl --retry 3 --retry-delay 2 -fsSL -o v${GDRCOPY_VERSION}.tar.gz \
+       https://${GITHUB_ARTIFACTORY}/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
+    && tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
+    && cd gdrcopy-${GDRCOPY_VERSION}/packages \
+    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
+    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
+    && cd / && rm -rf /tmp/gdrcopy

 # Fix DeepEP IBGDA symlink
 RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so

+# Set up locale
+RUN locale-gen en_US.UTF-8
+ENV LANG=en_US.UTF-8 \
+    LANGUAGE=en_US:en \
+    LC_ALL=en_US.UTF-8
+
+########################################################
+########## Framework Development Image ################
+########################################################
+
+# Copy local source if building from local
 FROM scratch AS local_src
 COPY . /src

-FROM base AS build-image
-# Install SGLang
-# Until torch 2.9 and cu13 are stable we manually update torch if you are on CUDA 13
-WORKDIR /sgl-workspace
+FROM base AS framework
+
 ARG BRANCH_TYPE
+ARG BUILD_TYPE
+ARG CUDA_VERSION
+ARG BUILD_AND_DOWNLOAD_PARALLEL
+ARG SGL_KERNEL_VERSION
+ARG SGL_VERSION
+ARG USE_LATEST_SGLANG
+ARG INSTALL_FLASHINFER_JIT_CACHE
+ARG FLASHINFER_VERSION
+ARG GRACE_BLACKWELL
+ARG GRACE_BLACKWELL_DEEPEP_BRANCH
+ARG DEEPEP_COMMIT
+ARG TRITON_LANG_COMMIT
+ARG GITHUB_ARTIFACTORY
+
+WORKDIR /sgl-workspace
+
+# Install SGLang
 COPY --from=local_src /src /tmp/local_src
 RUN if [ "$BRANCH_TYPE" = "local" ]; then \
         cp -r /tmp/local_src /sgl-workspace/sglang; \
@@ -106,44 +184,43 @@ RUN if [ "$BRANCH_TYPE" = "local" ]; then \
     else \
         git clone --depth=1 --branch v${SGL_VERSION} https://github.com/sgl-project/sglang.git /sgl-workspace/sglang; \
     fi \
-    && rm -rf /tmp/local_src
-RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install --upgrade pip setuptools wheel html5lib six \
-    && cd sglang \
-    && case "$CUDA_VERSION" in \
-        12.6.1) CUINDEX=126 ;; \
-        12.8.1) CUINDEX=128 ;; \
-        12.9.1) CUINDEX=129 ;; \
-        13.0.1) CUINDEX=130 ;; \
-        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
-    esac \
-    && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
-    python3 -m pip install https://${GITHUB_ARTIFACTORY}/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
-    ; \
-    elif [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
-    python3 -m pip install sgl-kernel==${SGL_KERNEL_VERSION} \
-    ; \
-    elif [ "$CUDA_VERSION" = "13.0.1" ]; then \
-    python3 -m pip install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu130-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
-    ; \
-    else \
-    echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
-    ; \
-    fi \
-    && python3 -m pip install -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
-    && if [ "$INSTALL_FLASHINFER_JIT_CACHE" = "1" ]; then \
-    python3 -m pip install flashinfer-jit-cache==${FLASHINFER_VERSION} --index-url https://flashinfer.ai/whl/cu${CUINDEX} ; \
-    fi \
-    && FLASHINFER_CUBIN_DOWNLOAD_THREADS=${BUILD_AND_DOWNLOAD_PARALLEL} FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin
+    && rm -rf /tmp/local_src
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade pip setuptools wheel html5lib six \
+    && cd sglang \
+    && case "$CUDA_VERSION" in \
+        12.6.1) CUINDEX=126 ;; \
+        12.8.1) CUINDEX=128 ;; \
+        12.9.1) CUINDEX=129 ;; \
+        13.0.1) CUINDEX=130 ;; \
+        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
+    esac \
+    && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
+    python3 -m pip install https://${GITHUB_ARTIFACTORY}/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
+    ; \
+    elif [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
+    python3 -m pip install sgl-kernel==${SGL_KERNEL_VERSION} \
+    ; \
+    elif [ "$CUDA_VERSION" = "13.0.1" ]; then \
+    python3 -m pip install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu130-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
+    ; \
+    else \
+    echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
+    ; \
+    fi \
+    && python3 -m pip install -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
+    && if [ "$INSTALL_FLASHINFER_JIT_CACHE" = "1" ]; then \
+    python3 -m pip install flashinfer-jit-cache==${FLASHINFER_VERSION} --index-url https://flashinfer.ai/whl/cu${CUINDEX} ; \
+    fi \
+    && FLASHINFER_CUBIN_DOWNLOAD_THREADS=${BUILD_AND_DOWNLOAD_PARALLEL} FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin

+# DeepEP
 # We use Tom's DeepEP fork for GB200 for now; the 1fd57b0276311d035d16176bb0076426166e52f3 commit is https://github.com/fzyzcjy/DeepEP/tree/gb200_blog_part_2
+# TODO: move from Tom's branch to DeepEP hybrid-ep branch
+# We use the nvshmem version that ships with torch 2.9.1
+# CU12 uses 3.3.20 and CU13 uses 3.3.24
 RUN set -eux; \
-    if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
-    pip install nvidia-nvshmem-cu12==3.3.20 ; \
-    elif [ "${CUDA_VERSION%%.*}" = "13" ]; then \
-    pip install nvidia-nvshmem-cu13==${NVSHMEM_VERSION} ; \
-    else \
-    echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ; \
-    fi && \
     if [ "$GRACE_BLACKWELL" = "1" ]; then \
     git clone https://github.com/fzyzcjy/DeepEP.git && \
     cd DeepEP && \
@@ -157,33 +234,40 @@ RUN set -eux; \
     sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
     cd .. ; \
     else \
-    wget -q https://${GITHUB_ARTIFACTORY}/deepseek-ai/DeepEP/archive/${DEEPEP_COMMIT}.zip && \
-    unzip ${DEEPEP_COMMIT}.zip && rm ${DEEPEP_COMMIT}.zip && mv DeepEP-${DEEPEP_COMMIT} DeepEP && cd DeepEP && \
-    sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
-    cd .. ; \
+    curl --retry 3 --retry-delay 2 -fsSL -o ${DEEPEP_COMMIT}.zip \
+        https://${GITHUB_ARTIFACTORY}/deepseek-ai/DeepEP/archive/${DEEPEP_COMMIT}.zip && \
+    unzip -q ${DEEPEP_COMMIT}.zip && rm ${DEEPEP_COMMIT}.zip && mv DeepEP-${DEEPEP_COMMIT} DeepEP && cd DeepEP && \
+    sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
+    cd .. ; \
     fi

 # Install DeepEP
-# CTK13 requires the cccl include
-RUN --mount=type=cache,target=/root/.cache/pip cd /sgl-workspace/DeepEP && \
+RUN --mount=type=cache,target=/root/.cache/pip \
+    cd /sgl-workspace/DeepEP && \
     case "$CUDA_VERSION" in \
-        12.6.1) \
-            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0' \
-            ;; \
-        12.8.1|12.9.1|13.0.1) \
-            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0;10.3' \
-            ;; \
-        *) \
-            echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
-            ;; \
+        12.6.1) \
+            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0' \
+            ;; \
+        12.8.1) \
+            # FIXED: 12.8.1 does NOT support Blackwell 10.3 \
+            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0' \
+            ;; \
+        12.9.1|13.0.1) \
+            # 12.9.1+ properly supports Blackwell 10.3 \
+            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0;10.3' \
+            ;; \
+        *) \
+            echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
+            ;; \
     esac && \
     if [ "${CUDA_VERSION%%.*}" = "13" ]; then \
-        sed -i "/^ include_dirs = \['csrc\/'\]/a\ include_dirs.append('${CUDA_HOME}/include/cccl')" setup.py; \
+        sed -i "/^ include_dirs = \['csrc\/'\]/a\    include_dirs.append('${CUDA_HOME}/include/cccl')" setup.py; \
     fi && \
     TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" MAX_JOBS=${BUILD_AND_DOWNLOAD_PARALLEL} pip install --no-build-isolation .

-# Python tools
-RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install \
+# Install essential Python packages
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install \
     datamodel_code_generator \
     mooncake-transfer-engine==0.3.7.post2 \
     pre-commit \
@@ -197,65 +281,77 @@ RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install \
     nixl \
     py-spy

-# Some patching packages
-# TODO: Remove this when torch version covers these packages
-# Move cutlass-dsl to pyproject.toml after drivers on CI runners are updated
+# Build and install sgl-router (install Rust, build, then remove to save space)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    curl --proto '=https' --tlsv1.2 --retry 3 --retry-delay 2 -sSf https://sh.rustup.rs | sh -s -- -y \
+    && export PATH="/root/.cargo/bin:${PATH}" \
+    && rustc --version && cargo --version \
+    && python3 -m pip install maturin \
+    && cd /sgl-workspace/sglang/sgl-router/bindings/python \
+    && ulimit -n 65536 && maturin build --release --features vendored-openssl --out dist \
+    && python3 -m pip install --force-reinstall dist/*.whl \
+    && cd /sgl-workspace/sglang/sgl-router \
+    && cargo build --release --bin sglang-router --features vendored-openssl \
+    && cp target/release/sglang-router /usr/local/bin/sglang-router \
+    && rm -rf /root/.cargo /root/.rustup target dist ~/.cargo \
+    && sed -i '/\.cargo\/env/d' /root/.profile /root/.bashrc 2>/dev/null || true
+
+# Patching packages for CUDA 12/13 compatibility
+# TODO: Remove when torch version covers these packages
+# TODO: Move cutlass-dsl to pyproject.toml after drivers on CI runners are updated
 RUN --mount=type=cache,target=/root/.cache/pip if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
     python3 -m pip install nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps ; \
-    python3 -m pip install nvidia-cudnn-cu12==9.16.0.29 --force-reinstall --no-deps; \
-    python3 -m pip install nvidia-nvshmem-cu12==3.3.20 --force-reinstall --no-deps; \
-    python3 -m pip install nvidia-cutlass-dsl==4.3.0 --force-reinstall --no-deps; \
+    python3 -m pip install nvidia-cudnn-cu12==9.16.0.29 --force-reinstall --no-deps ; \
+    python3 -m pip install nvidia-cutlass-dsl==4.3.0 --force-reinstall --no-deps ; \
     elif [ "${CUDA_VERSION%%.*}" = "13" ]; then \
     python3 -m pip install nvidia-nccl-cu13==2.28.3 --force-reinstall --no-deps ; \
     python3 -m pip install nvidia-cublas==13.1.0.3 --force-reinstall --no-deps ; \
     python3 -m pip install nixl-cu13 ; \
-    python3 -m pip install nvidia-nvshmem-cu13==${NVSHMEM_VERSION} --force-reinstall --no-deps; \
-    python3 -m pip install nvidia-cutlass-dsl==4.3.1 --force-reinstall --no-deps; \
+    python3 -m pip install nvidia-cutlass-dsl==4.3.1 --force-reinstall --no-deps ; \
     fi

-# Install development tools and utilities
-RUN --mount=type=cache,target=/var/cache/apt apt-get update && apt-get install -y \
+# Install development tools
+RUN --mount=type=cache,target=/var/cache/apt,id=framework-apt \
+    apt-get update && apt-get install -y --no-install-recommends \
     gdb \
     ninja-build \
     vim \
     tmux \
     htop \
-    wget \
-    curl \
-    locales \
-    lsof \
-    git \
-    git-lfs \
     zsh \
     tree \
     silversearcher-ag \
     cloc \
-    unzip \
     pkg-config \
-    libssl-dev \
     bear \
-    ccache \
     less \
+    rdma-core \
+    openssh-server \
     gnuplot \
-    && apt install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 \
+    infiniband-diags \
+    perftest \
+    ibverbs-providers \
+    libibumad3 \
+    libibverbs1 \
+    libnl-3-200 \
+    libnl-route-3-200 \
+    librdmacm1 \
     && rm -rf /var/lib/apt/lists/* \
     && apt-get clean

-RUN --mount=type=cache,target=/var/cache/apt apt update -y \
+# Install NVIDIA development tools
+RUN --mount=type=cache,target=/var/cache/apt,id=framework-apt \
+    apt update -y \
     && apt install -y --no-install-recommends gnupg \
     && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
     && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "x86_64"; fi)/7fa2af80.pub \
     && apt update -y \
-    && apt install nsight-systems-cli -y
-
-# Set up locale
-RUN locale-gen en_US.UTF-8
-ENV LANG=en_US.UTF-8
-ENV LANGUAGE=en_US:en
-ENV LC_ALL=en_US.UTF-8
+    && apt install -y --no-install-recommends nsight-systems-cli \
+    && rm -rf /var/lib/apt/lists/*

-# Install minimal Python packages
-RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install --break-system-packages \
+# Install minimal Python dev packages
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --break-system-packages \
     pytest \
     black \
     isort \
@@ -268,66 +364,183 @@ RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install --break-sy
     tabulate \
     termplotlib

-# Install diff-so-fancy
-RUN curl -LSso /usr/local/bin/diff-so-fancy https://${GITHUB_ARTIFACTORY}/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
+# diff-so-fancy
+RUN curl --retry 3 --retry-delay 2 -LSso /usr/local/bin/diff-so-fancy \
+    https://${GITHUB_ARTIFACTORY}/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
     && chmod +x /usr/local/bin/diff-so-fancy

-# Install clang-format
-RUN curl -LSso /usr/local/bin/clang-format https://${GITHUB_ARTIFACTORY}/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
+# clang-format
+RUN curl --retry 3 --retry-delay 2 -LSso /usr/local/bin/clang-format \
+    https://${GITHUB_ARTIFACTORY}/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
     && chmod +x /usr/local/bin/clang-format

-# Install clangd
-RUN curl -L https://${GITHUB_ARTIFACTORY}/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip -o clangd.zip \
-    && unzip clangd.zip \
+# clangd
+RUN curl --retry 3 --retry-delay 2 -fsSL -o clangd.zip \
+    https://${GITHUB_ARTIFACTORY}/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip \
+    && unzip -q clangd.zip \
     && cp -r clangd_18.1.3/bin/* /usr/local/bin/ \
     && cp -r clangd_18.1.3/lib/* /usr/local/lib/ \
     && rm -rf clangd_18.1.3 clangd.zip

-# Install CMake
+# CMake
 RUN CMAKE_VERSION=3.31.1 \
     && ARCH=$(uname -m) \
     && CMAKE_INSTALLER="cmake-${CMAKE_VERSION}-linux-${ARCH}" \
-    && wget -q "https://${GITHUB_ARTIFACTORY}/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_INSTALLER}.tar.gz" \
+    && curl --retry 3 --retry-delay 2 -fsSL -o "${CMAKE_INSTALLER}.tar.gz" \
+       "https://${GITHUB_ARTIFACTORY}/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_INSTALLER}.tar.gz" \
     && tar -xzf "${CMAKE_INSTALLER}.tar.gz" \
     && cp -r "${CMAKE_INSTALLER}/bin/"* /usr/local/bin/ \
     && cp -r "${CMAKE_INSTALLER}/share/"* /usr/local/share/ \
     && rm -rf "${CMAKE_INSTALLER}" "${CMAKE_INSTALLER}.tar.gz"

-# Build and install sgl-router (Rust toolchain removed after build to save space)
-RUN --mount=type=cache,target=/root/.cache/pip curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
-    && export PATH="/root/.cargo/bin:${PATH}" \
-    && rustc --version && cargo --version \
-    && python3 -m pip install maturin \
-    && cd /sgl-workspace/sglang/sgl-router/bindings/python \
-    && ulimit -n 65536 && maturin build --release --features vendored-openssl --out dist \
-    && python3 -m pip install --force-reinstall dist/*.whl \
-    && rm -rf /root/.cargo /root/.rustup target dist ~/.cargo \
-    && sed -i '/\.cargo\/env/d' /root/.profile /root/.bashrc /root/.zshenv 2>/dev/null || true
-
+# Install just
+RUN curl --proto '=https' --tlsv1.2 --retry 3 --retry-delay 2 -sSf https://just.systems/install.sh | \
+    sed "s|https://github.com|https://${GITHUB_ARTIFACTORY}|g" | \
+    bash -s -- --tag 1.42.4 --to /usr/local/bin

 # Add yank script
 COPY --chown=root:root --chmod=755 docker/configs/yank /usr/local/bin/yank

 # Install oh-my-zsh and plugins
-RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended \
-    && git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
-    && git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting
+RUN sh -c "$(curl --retry 3 --retry-delay 2 -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended \
+    && git clone --depth 1 https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
+    && git clone --depth 1 https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting

-# Configure Vim and tmux
+# Configure development environment
 COPY docker/configs/.vimrc /root/.vimrc
 COPY docker/configs/.tmux.conf /root/.tmux.conf
-
-# Configure Git
 COPY docker/configs/.gitconfig /tmp/.gitconfig
 RUN cat /tmp/.gitconfig >> /root/.gitconfig && rm /tmp/.gitconfig
-
-# Configure zsh
 COPY docker/configs/.zshrc /root/.zshrc

-RUN set -euxo ; \
-    curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | \
-    sed "s|https://github.com|https://${GITHUB_ARTIFACTORY}|g" | \
-    bash -s -- --tag 1.42.4 --to /usr/local/bin
+# Fix Triton to use system ptxas for Blackwell (sm_103a) support (CUDA 13+ only)
+RUN if [ "${CUDA_VERSION%%.*}" = "13" ] && [ -d /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin ]; then \
+    rm -f /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas && \
+    ln -s /usr/local/cuda/bin/ptxas /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas; \
+    fi

 # Set workspace directory
 WORKDIR /sgl-workspace/sglang
+
+########################################################
+########## Runtime Image ##############################
+########################################################
+#
+# PURPOSE: Production runtime environment with JIT support
+#
+# This stage creates a production-ready image containing:
+# - Pre-compiled SGLang and DeepEP components
+# - Full CUDA toolchain for JIT compilation (DeepGEMM, Triton, FlashInfer)
+# - Optimized for inference workloads and deployment
+# - Smaller than framework (no dev tools like vim, tmux, nsight, etc.)
+#
+# Use this stage when you need:
+# - Production deployment of SGLang
+# - JIT compilation support for FP8/microscaling kernels
+# - Ready-to-run inference server environment
+#
+# Note: Uses devel base for complete NVCC toolchain required by DeepGEMM JIT
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu24.04 AS runtime
+
+ARG CUDA_VERSION
+ARG TARGETARCH
+ARG GDRCOPY_VERSION=2.5.1
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    CUDA_HOME=/usr/local/cuda \
+    GDRCOPY_HOME=/usr/src/gdrdrv-${GDRCOPY_VERSION}/
+
+# Add GKE default lib and bin locations + CUDA compiler paths for FlashInfer JIT
+ENV PATH="${PATH}:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/cuda/nvvm/bin" \
+    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
+
+# Install runtime dependencies (devel base provides gcc/g++/build tools)
+RUN --mount=type=cache,target=/var/cache/apt,id=runtime-apt \
+    apt-get update && apt-get install -y --no-install-recommends \
+    # Python runtime
+    software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa -y \
+    && apt-get update && apt-get install -y --no-install-recommends --allow-change-held-packages \
+    python3.12-full \
+    python3.12-dev \
+    wget \
+    # Core system utilities
+    tzdata \
+    ca-certificates \
+    netcat-openbsd \
+    curl \
+    git \
+    # Runtime libraries
+    libopenmpi3 \
+    libnuma1 \
+    libibverbs1 \
+    libibumad3 \
+    librdmacm1 \
+    libnl-3-200 \
+    libnl-route-3-200 \
+    ibverbs-providers \
+    libgoogle-glog0v6t64 \
+    libunwind8 \
+    libboost-system1.83.0 \
+    libboost-thread1.83.0 \
+    libboost-filesystem1.83.0 \
+    libgrpc++1.51t64 \
+    libprotobuf32t64 \
+    libhiredis1.1.0 \
+    libcurl4 \
+    libczmq4 \
+    libfabric1 \
+    libssl3 \
+    # RDMA runtime
+    rdma-core \
+    infiniband-diags \
+    perftest \
+    # Build tools for JIT compilation
+    ninja-build \
+    # NCCL packages needed for pynccl_allocator JIT compilation (-lnccl)
+    libnccl2 \
+    libnccl-dev \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 \
+    && update-alternatives --set python3 /usr/bin/python3.12 \
+    && ln -sf /usr/bin/python3.12 /usr/bin/python \
+    && wget -q https://bootstrap.pypa.io/get-pip.py \
+    && python3 get-pip.py --break-system-packages \
+    && rm get-pip.py \
+    # Allow pip to install packages globally (PEP 668 workaround for Ubuntu 24.04)
+    && python3 -m pip config set global.break-system-packages true \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+# Set up locale
+RUN apt-get update && apt-get install -y --no-install-recommends locales \
+    && locale-gen en_US.UTF-8 \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV LANG=en_US.UTF-8 \
+    LANGUAGE=en_US:en \
+    LC_ALL=en_US.UTF-8
+
+# Copy Python site-packages from framework (contains all built packages)
+COPY --from=framework /usr/local/lib/python3.12/dist-packages /usr/local/lib/python3.12/dist-packages
+
+# Copy SGLang workspace
+COPY --from=framework /sgl-workspace /sgl-workspace
+
+# Fix Triton to use system ptxas for Blackwell (sm_103a) support (CUDA 13+ only)
+RUN if [ "${CUDA_VERSION%%.*}" = "13" ] && [ -d /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin ]; then \
+    rm -f /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas && \
+    ln -s /usr/local/cuda/bin/ptxas /usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/bin/ptxas; \
+    fi
+
+# Copy GDRCopy runtime libraries (but not the build artifacts)
+COPY --from=framework /usr/lib/libgdrapi.so* /usr/lib/
+COPY --from=framework /usr/bin/gdrcopy_* /usr/bin/
+COPY --from=framework /usr/src/gdrdrv-2.5.1 /usr/src/gdrdrv-2.5.1
+
+# Fix DeepEP IBGDA symlink in runtime
+RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so
+
+WORKDIR /sgl-workspace/sglang
+
+# Default command
+CMD ["/bin/bash"]
diff --git a/docs/get_started/install.md b/docs/get_started/install.md
index a4911c4159a..913c3bcd77d 100644
--- a/docs/get_started/install.md
+++ b/docs/get_started/install.md
@@ -53,6 +53,19 @@ docker run --gpus all \
     python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```

+For production deployments, use the `runtime` variant which is significantly smaller (~40% reduction) by excluding build tools and development dependencies:
+
+```bash
+docker run --gpus all \
+    --shm-size 32g \
+    -p 30000:30000 \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HF_TOKEN=<secret>" \
+    --ipc=host \
+    lmsysorg/sglang:latest-runtime \
+    python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+```
+
 You can also find the nightly docker images [here](https://hub.docker.com/r/lmsysorg/sglang/tags?name=nightly).

 ## Method 4: Using Kubernetes
@@ -172,10 +185,10 @@ echo "Build and push completed successfully!"

 3. Deploy a model for serving on AWS Sagemaker, refer to [deploy_and_serve_endpoint.py](https://github.com/sgl-project/sglang/blob/main/examples/sagemaker/deploy_and_serve_endpoint.py). For more information, check out [sagemaker-python-sdk](https://github.com/aws/sagemaker-python-sdk).

-   1. By default, the model server on SageMaker will run with the following command: `python3 -m sglang.launch_server --model-path opt/ml/model --host 0.0.0.0 --port 8080`. This is optimal for hosting your own model with SageMaker.
-   2. To modify your model serving parameters, the [serve](https://github.com/sgl-project/sglang/blob/main/docker/serve) script allows for all available options within `python3 -m sglang.launch_server --help` cli by specifying environment variables with prefix `SM_SGLANG_`.
-   3. The serve script will automatically convert all environment variables with prefix `SM_SGLANG_` from `SM_SGLANG_INPUT_ARGUMENT` into `--input-argument` to be parsed into `python3 -m sglang.launch_server` cli.
-   4. For example, to run [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) with reasoning parser, simply add additional environment variables `SM_SGLANG_MODEL_PATH=Qwen/Qwen3-0.6B` and `SM_SGLANG_REASONING_PARSER=qwen3`.
+    1. By default, the model server on SageMaker will run with the following command: `python3 -m sglang.launch_server --model-path opt/ml/model --host 0.0.0.0 --port 8080`. This is optimal for hosting your own model with SageMaker.
+    2. To modify your model serving parameters, the [serve](https://github.com/sgl-project/sglang/blob/main/docker/serve) script allows for all available options within `python3 -m sglang.launch_server --help` cli by specifying environment variables with prefix `SM_SGLANG_`.
+    3. The serve script will automatically convert all environment variables with prefix `SM_SGLANG_` from `SM_SGLANG_INPUT_ARGUMENT` into `--input-argument` to be parsed into `python3 -m sglang.launch_server` cli.
+    4. For example, to run [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) with reasoning parser, simply add additional environment variables `SM_SGLANG_MODEL_PATH=Qwen/Qwen3-0.6B` and `SM_SGLANG_REASONING_PARSER=qwen3`.
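
A minimal sketch of how the new multi-stage targets can be exercised locally, assuming Docker with the buildx plugin is available; it reuses the `runtime` target and the `CUDA_VERSION`/`BUILD_TYPE`/`GRACE_BLACKWELL` build args introduced in docker/Dockerfile above, and the local tag name `sglang-runtime:local` is only illustrative.

    # Build only the production runtime stage from the repository root
    docker buildx build \
      --target runtime \
      -f docker/Dockerfile \
      --build-arg CUDA_VERSION=12.9.1 \
      --build-arg BUILD_TYPE=all \
      --build-arg GRACE_BLACKWELL=0 \
      -t sglang-runtime:local .

    # Inspect a multi-arch runtime manifest published by the create-manifests job
    docker buildx imagetools inspect lmsysorg/sglang:latest-runtime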