diff --git a/README.md b/README.md index 11e21d50504..e5ef6954feb 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,7 @@ It is recommended to use [NGC PyTorch Container](https://catalog.ngc.nvidia.com/ > [!Note] > Ensure that you select a PyTorch container image version that matches the version of TensorRT-LLM you are using. -> For example, if you are using `tensorrt-llm==1.0.0rc6`, use the PyTorch container image version `25.06`. +> For example, if you are using `tensorrt-llm==1.1.0rc3`, use the PyTorch container image version `25.06`. > To find the correct PyTorch container version for your desired `tensorrt-llm` release, visit the [TensorRT-LLM Dockerfile.multi](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/Dockerfile.multi) on GitHub. Switch to the branch that matches your `tensorrt-llm` version, and look for the `BASE_TAG` line to identify the recommended PyTorch container tag. > [!Important] diff --git a/components/backends/trtllm/engine_configs/decode.yaml b/components/backends/trtllm/engine_configs/decode.yaml index 3cf5476e1ed..a0154bb6e31 100644 --- a/components/backends/trtllm/engine_configs/decode.yaml +++ b/components/backends/trtllm/engine_configs/decode.yaml @@ -28,4 +28,4 @@ kv_cache_config: free_gpu_memory_fraction: 0.85 cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml index 59b9aabe984..8f0bd83919b 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml @@ -54,4 +54,4 @@ cuda_graph_config: print_iter_log: true cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml index f44bcac1417..46494e8d68d 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml @@ -38,4 +38,4 @@ speculative_config: num_nextn_predict_layers: 1 cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml index 73e193c146a..28f246574be 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml @@ -57,4 +57,4 @@ cuda_graph_config: print_iter_log: true cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml index 3d6d4d35740..13b2410a672 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml @@ -36,4 +36,4 @@ disable_overlap_scheduler: true print_iter_log: true cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml index 652cf82250a..8f953c6472b 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml @@ -63,4 +63,4 @@ cuda_graph_config: print_iter_log: true cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml index 4f7aabe6824..8a756cc32b0 100644 --- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml +++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml @@ -41,4 +41,4 @@ disable_overlap_scheduler: true print_iter_log: true cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/encode.yaml b/components/backends/trtllm/engine_configs/encode.yaml index 5ac1f884108..6f0c20990f5 100644 --- a/components/backends/trtllm/engine_configs/encode.yaml +++ b/components/backends/trtllm/engine_configs/encode.yaml @@ -27,4 +27,4 @@ kv_cache_config: free_gpu_memory_fraction: 0.85 cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml b/components/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml index f417ed6f0aa..c3ea683857d 100644 --- a/components/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml +++ b/components/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml @@ -26,4 +26,4 @@ kv_cache_config: - 32768 cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml b/components/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml index cd36bfa31a3..663d241b580 100644 --- a/components/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml +++ b/components/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml @@ -27,4 +27,4 @@ kv_cache_config: - 32768 cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/gpt_oss/decode.yaml b/components/backends/trtllm/engine_configs/gpt_oss/decode.yaml index e3703824c14..1ba98445458 100644 --- a/components/backends/trtllm/engine_configs/gpt_oss/decode.yaml +++ b/components/backends/trtllm/engine_configs/gpt_oss/decode.yaml @@ -19,7 +19,7 @@ moe_config: cuda_graph_config: enable_padding: true cache_transceiver_config: - backend: ucx + backend: UCX max_tokens_in_buffer: 65536 print_iter_log: false stream_interval: 10 diff --git a/components/backends/trtllm/engine_configs/gpt_oss/prefill.yaml b/components/backends/trtllm/engine_configs/gpt_oss/prefill.yaml index 07d979a6fd1..87bab09fd48 100644 --- a/components/backends/trtllm/engine_configs/gpt_oss/prefill.yaml +++ b/components/backends/trtllm/engine_configs/gpt_oss/prefill.yaml @@ -21,7 +21,7 @@ cuda_graph_config: max_batch_size: 32 enable_padding: true cache_transceiver_config: - backend: ucx + backend: UCX max_tokens_in_buffer: 65536 print_iter_log: false stream_interval: 10 diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml index 171df484d8f..019cac5ac64 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml @@ -49,4 +49,4 @@ cuda_graph_config: print_iter_log: true cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml index ce3059f0b49..5b978deece1 100644 --- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml +++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml @@ -34,4 +34,4 @@ kv_cache_config: enable_block_reuse: false cache_transceiver_config: - backend: default + backend: DEFAULT diff --git a/components/backends/trtllm/engine_configs/multimodal/agg.yaml b/components/backends/trtllm/engine_configs/multimodal/agg.yaml index a2b90336792..754f8ce759d 100644 --- a/components/backends/trtllm/engine_configs/multimodal/agg.yaml +++ b/components/backends/trtllm/engine_configs/multimodal/agg.yaml @@ -26,7 +26,7 @@ kv_cache_config: enable_block_reuse: false cache_transceiver_config: - backend: default + backend: DEFAULT # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: overlap_scheduler enabled by default since this commit and changed # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': diff --git a/components/backends/trtllm/engine_configs/multimodal/decode.yaml b/components/backends/trtllm/engine_configs/multimodal/decode.yaml index bd90c0b62ee..6dbd676ee44 100644 --- a/components/backends/trtllm/engine_configs/multimodal/decode.yaml +++ b/components/backends/trtllm/engine_configs/multimodal/decode.yaml @@ -26,4 +26,4 @@ kv_cache_config: enable_block_reuse: false cache_transceiver_config: - backend: default \ No newline at end of file + backend: DEFAULT \ No newline at end of file diff --git a/components/backends/trtllm/engine_configs/multimodal/llama4/decode.yaml b/components/backends/trtllm/engine_configs/multimodal/llama4/decode.yaml index e94d3ee0d80..262a2be1cc9 100644 --- a/components/backends/trtllm/engine_configs/multimodal/llama4/decode.yaml +++ b/components/backends/trtllm/engine_configs/multimodal/llama4/decode.yaml @@ -26,4 +26,4 @@ kv_cache_config: enable_block_reuse: false cache_transceiver_config: - backend: default \ No newline at end of file + backend: DEFAULT \ No newline at end of file diff --git a/components/backends/trtllm/engine_configs/multimodal/llama4/prefill.yaml b/components/backends/trtllm/engine_configs/multimodal/llama4/prefill.yaml index 23f54130c47..3d2c1440156 100644 --- a/components/backends/trtllm/engine_configs/multimodal/llama4/prefill.yaml +++ b/components/backends/trtllm/engine_configs/multimodal/llama4/prefill.yaml @@ -28,4 +28,4 @@ kv_cache_config: enable_block_reuse: false cache_transceiver_config: - backend: default \ No newline at end of file + backend: DEFAULT \ No newline at end of file diff --git a/components/backends/trtllm/engine_configs/multimodal/prefill.yaml b/components/backends/trtllm/engine_configs/multimodal/prefill.yaml index d3ad035541b..83a65e8bf30 100644 --- a/components/backends/trtllm/engine_configs/multimodal/prefill.yaml +++ b/components/backends/trtllm/engine_configs/multimodal/prefill.yaml @@ -28,4 +28,4 @@ kv_cache_config: enable_block_reuse: false cache_transceiver_config: - backend: default \ No newline at end of file + backend: DEFAULT \ No newline at end of file diff --git a/components/backends/trtllm/engine_configs/prefill.yaml b/components/backends/trtllm/engine_configs/prefill.yaml index a7b8d3aaa01..4996c1fdc61 100644 --- a/components/backends/trtllm/engine_configs/prefill.yaml +++ b/components/backends/trtllm/engine_configs/prefill.yaml @@ -27,4 +27,4 @@ kv_cache_config: free_gpu_memory_fraction: 0.85 cache_transceiver_config: - backend: default \ No newline at end of file + backend: DEFAULT \ No newline at end of file diff --git a/components/backends/trtllm/multimodal_support.md b/components/backends/trtllm/multimodal_support.md index 25fbf7130c3..5fb29038a46 100644 --- a/components/backends/trtllm/multimodal_support.md +++ b/components/backends/trtllm/multimodal_support.md @@ -14,24 +14,6 @@ limitations under the License. # Multimodal Support -> [!Important] -> There are some known issues in tensorrt_llm==1.0.0rc6 version for multimodal support -> It is important to rebuild the dynamo container with a specific version of tensorrt_llm -> commit to use multimodal feature. -## Build Container - -```bash -./container/build.sh --framework trtllm --tensorrtllm-commit b4065d8ca64a64eee9fdc64b39cb66d73d4be47c -``` - -## Run Container - -```bash -./container/run.sh --framework trtllm -it -``` - -## Usage Guide - TRTLLM supports multimodal models with dynamo. You can provide multimodal inputs in the following ways: - By sending image URLs diff --git a/components/backends/trtllm/src/dynamo/trtllm/main.py b/components/backends/trtllm/src/dynamo/trtllm/main.py index d8b35eb5f5b..53b5a73e554 100644 --- a/components/backends/trtllm/src/dynamo/trtllm/main.py +++ b/components/backends/trtllm/src/dynamo/trtllm/main.py @@ -8,7 +8,6 @@ import sys import uvloop -from tensorrt_llm import SamplingParams from tensorrt_llm.llmapi import ( BuildConfig, CapacitySchedulerPolicy, @@ -16,6 +15,7 @@ KvCacheConfig, SchedulerConfig, ) +from tensorrt_llm.llmapi.llm import SamplingParams from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_options from tensorrt_llm.llmapi.tokenizer import tokenizer_factory from torch.cuda import device_count diff --git a/components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py b/components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py index 79f3d559209..ae9893b5396 100644 --- a/components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py +++ b/components/backends/trtllm/src/dynamo/trtllm/request_handlers/handler_base.py @@ -21,8 +21,8 @@ from typing import Optional, Union import torch -from tensorrt_llm import SamplingParams from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams +from tensorrt_llm.llmapi.llm import SamplingParams from dynamo.logits_processing.examples import HelloWorldLogitsProcessor from dynamo.nixl_connect import Connector diff --git a/container/Dockerfile b/container/Dockerfile index eb296eeaa7b..3308c077bc7 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -1,3 +1,4 @@ +# syntax=docker/dockerfile:1.10.0 # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -35,8 +36,6 @@ ARG ARCH_ALT=x86_64 ARG USE_SCCACHE ARG SCCACHE_BUCKET="" ARG SCCACHE_REGION="" -ARG AWS_ACCESS_KEY_ID="" -ARG AWS_SECRET_ACCESS_KEY="" # NIXL configuration ARG NIXL_UCX_REF=v1.19.0 @@ -58,8 +57,6 @@ ARG PYTHON_VERSION ARG USE_SCCACHE ARG SCCACHE_BUCKET ARG SCCACHE_REGION -ARG AWS_ACCESS_KEY_ID -ARG AWS_SECRET_ACCESS_KEY ARG NIXL_UCX_REF ARG NIXL_REF @@ -164,7 +161,9 @@ ENV PATH=/usr/local/bin/etcd/:$PATH ################################## # Build and install UCX -RUN rm -rf /opt/hpcx/ucx && \ +RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ + --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ + rm -rf /opt/hpcx/ucx && \ rm -rf /usr/local/ucx && \ echo "Building UCX with reference $NIXL_UCX_REF" && \ cd /usr/local/src && \ @@ -214,7 +213,9 @@ ENV NIXL_SRC_DIR=/opt/nixl \ NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins # Build and install NIXL -RUN git clone --depth 1 --branch ${NIXL_REF} "https://github.com/ai-dynamo/nixl.git" ${NIXL_SRC_DIR} && \ +RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ + --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ + git clone --depth 1 --branch ${NIXL_REF} "https://github.com/ai-dynamo/nixl.git" ${NIXL_SRC_DIR} && \ cd ${NIXL_SRC_DIR} && \ if [ "$ARCH" = "arm64" ]; then \ nixl_build_args="-Ddisable_gds_backend=true"; \ @@ -230,7 +231,9 @@ RUN git clone --depth 1 --branch ${NIXL_REF} "https://github.com/ai-dynamo/nixl. # Build NIXL Python module # TODO OPS-590: Move gds_path selection based on arch into NIXL build and re-enable gds backend for arm64 -RUN if [ "$ARCH" = "arm64" ]; then \ +RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ + --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ + if [ "$ARCH" = "arm64" ]; then \ cd ${NIXL_SRC_DIR} && uv build . --out-dir /opt/dynamo/wheelhouse/nixl \ --config-settings=setup-args="-Ddisable_gds_backend=true"; \ else \ @@ -272,8 +275,6 @@ ARG ENABLE_KVBM ARG USE_SCCACHE ARG SCCACHE_BUCKET ARG SCCACHE_REGION -ARG AWS_ACCESS_KEY_ID -ARG AWS_SECRET_ACCESS_KEY WORKDIR /opt/dynamo @@ -317,7 +318,9 @@ COPY lib/ /opt/dynamo/lib/ COPY components/ /opt/dynamo/components/ # Build wheels -RUN uv build --wheel --out-dir /opt/dynamo/dist && \ +RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ + --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ + uv build --wheel --out-dir /opt/dynamo/dist && \ cd /opt/dynamo/lib/bindings/python && \ uv pip install maturin[patchelf] && \ if [ "$ENABLE_KVBM" = "true" ]; then \ diff --git a/container/Dockerfile.trtllm b/container/Dockerfile.trtllm index 6bf259e2a32..b8a1f5b0407 100644 --- a/container/Dockerfile.trtllm +++ b/container/Dockerfile.trtllm @@ -140,7 +140,6 @@ COPY --from=trtllm_wheel . /trtllm_wheel/ # Note: TensorRT needs to be uninstalled before installing the TRTLLM wheel # because there might be mismatched versions of TensorRT between the NGC PyTorch # and the TRTLLM wheel. -# Locking triton version to 3.3.1 as 3.4.0 breaks tensorrt-llm 1.0.0rc6 RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \ pip uninstall -y tensorrt && \ if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \ @@ -148,9 +147,6 @@ RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \ WHEEL_FILE=$(find /trtllm_wheel -name "*.whl" | head -n 1); \ if [ -n "$WHEEL_FILE" ]; then \ pip install "$WHEEL_FILE"; \ - if [ "$ARCH" = "amd64" ]; then \ - pip install "triton==3.3.1"; \ - fi; \ else \ echo "No wheel file found in /trtllm_wheel directory."; \ exit 1; \ @@ -158,9 +154,6 @@ RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true && \ else \ # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI pip install --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}"; \ - if [ "$ARCH" = "amd64" ]; then \ - pip install "triton==3.3.1"; \ - fi; \ fi # Install test dependencies @@ -477,12 +470,7 @@ COPY --from=dev /workspace/target/release/metrics /usr/local/bin/metrics # NOTE: If a package (tensorrt_llm) exists on both --index-url and --extra-index-url, # uv will prioritize the --extra-index-url, unless --index-strategy unsafe-best-match # is also specified. So set the configurable index as a --extra-index-url for prioritization. -# NOTE: locking triton version to 3.3.1 as 3.4.0 breaks tensorrt-llm 1.0.0rc6 -# NOTE: locking cuda-python version to <13 to avoid breaks with tensorrt-llm 1.0.0rc6. This -# can be removed after https://github.com/NVIDIA/TensorRT-LLM/pull/6703 is merged -# we upgrade to a published pip wheel containing this change. -RUN python3 -m pip install --no-cache-dir --break-system-packages "cuda-python>=12,<13" && \ - python3 -m pip install --no-cache-dir --break-system-packages --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" && \ +RUN python3 -m pip install --no-cache-dir --break-system-packages --extra-index-url "${TENSORRTLLM_INDEX_URL}" "${TENSORRTLLM_PIP_WHEEL}" && \ python3 -m pip install --no-cache-dir --break-system-packages \ /workspace/wheelhouse/ai_dynamo_runtime*cp312*.whl \ /workspace/wheelhouse/ai_dynamo*any.whl \ diff --git a/container/Dockerfile.vllm b/container/Dockerfile.vllm index 1503da9a275..9061c6607ff 100644 --- a/container/Dockerfile.vllm +++ b/container/Dockerfile.vllm @@ -1,3 +1,4 @@ +# syntax=docker/dockerfile:1.10.0 # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -20,8 +21,6 @@ ARG TORCH_BACKEND="cu128" ARG USE_SCCACHE ARG SCCACHE_BUCKET="" ARG SCCACHE_REGION="" -ARG AWS_ACCESS_KEY_ID="" -ARG AWS_SECRET_ACCESS_KEY="" # Match 0.10.1.1 vLLM release # https://github.com/vllm-project/vllm/releases/tag/v0.10.1.1 @@ -121,8 +120,6 @@ ARG USE_SCCACHE ARG ARCH_ALT ARG SCCACHE_BUCKET ARG SCCACHE_REGION -ARG AWS_ACCESS_KEY_ID="" -ARG AWS_SECRET_ACCESS_KEY="" ENV ARCH_ALT=${ARCH_ALT} RUN if [ "$USE_SCCACHE" = "true" ]; then \ @@ -139,6 +136,8 @@ ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \ # Install VLLM and related dependencies RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ --mount=type=cache,target=/root/.cache/uv \ + --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ + --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \ # TODO - split vllm, DeepEP, DeepGeMM, PPLX installs # Should be able to select how you want your build to go cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \ diff --git a/container/build.sh b/container/build.sh index 08849b3b02d..a095db4c9aa 100755 --- a/container/build.sh +++ b/container/build.sh @@ -89,7 +89,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/" # TensorRT-LLM commit to use for building the trtllm wheel if not provided. # Important Note: This commit is not used in our CI pipeline. See the CI # variables to learn how to run a pipeline with a specific commit. -DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="a16ba6445c61ed70e7aadfe787d6f316bb422652" +DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="e81c50dbd2811ec858eccc2c71b5e7a330ff7e24" TRTLLM_COMMIT="" TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0" TRTLLM_GIT_URL="" @@ -98,7 +98,7 @@ TRTLLM_GIT_URL="" TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package. # Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package. -DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc6" +DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.1.0rc3" TENSORRTLLM_PIP_WHEEL="" @@ -602,8 +602,8 @@ if [ "$USE_SCCACHE" = true ]; then BUILD_ARGS+=" --build-arg USE_SCCACHE=true" BUILD_ARGS+=" --build-arg SCCACHE_BUCKET=${SCCACHE_BUCKET}" BUILD_ARGS+=" --build-arg SCCACHE_REGION=${SCCACHE_REGION}" - BUILD_ARGS+=" --build-arg AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}" - BUILD_ARGS+=" --build-arg AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}" + BUILD_ARGS+=" --secret id=aws-key-id,env=AWS_ACCESS_KEY_ID" + BUILD_ARGS+=" --secret id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY" fi LATEST_TAG="--tag dynamo:latest-${FRAMEWORK,,}" diff --git a/docs/support_matrix.md b/docs/support_matrix.md index f6019c003aa..c6dc81858ef 100644 --- a/docs/support_matrix.md +++ b/docs/support_matrix.md @@ -67,7 +67,7 @@ If you are using a **GPU**, the following GPU models and architectures are suppo | **Build Dependency** | **Version** | | :------------------- | :------------------------------------------------------------------------------- | | **Base Container** | [25.03](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda-dl-base/tags) | -| **TensorRT-LLM** | 1.0.0rc6 | +| **TensorRT-LLM** | 1.1.0rc3 | | **NIXL** | 0.4.1 | | **vLLM** | 0.10.1.1 | | **SGLang** | 0.5.0rc2 | diff --git a/pyproject.toml b/pyproject.toml index 091b2910577..f8878cf55a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,8 +48,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git" [project.optional-dependencies] trtllm =[ "uvloop", - "tensorrt-llm==1.0.0rc6", - "triton==3.3.1", # locking triton as version 3.4.0 breaks tensorrt-llm 1.0.0rc6 + "tensorrt-llm==1.1.0rc3", ] vllm = [