diff --git a/README.md b/README.md index 607cffb9e74..51630be811f 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,7 @@ It is recommended to use [NGC PyTorch Container](https://catalog.ngc.nvidia.com/ > [!Note] > Ensure that you select a PyTorch container image version that matches the version of TensorRT-LLM you are using. -> For example, if you are using `tensorrt-llm==1.1.0rc3`, use the PyTorch container image version `25.06`. +> For example, if you are using `tensorrt-llm==1.1.0rc5`, use the PyTorch container image version `25.06`. > To find the correct PyTorch container version for your desired `tensorrt-llm` release, visit the [TensorRT-LLM Dockerfile.multi](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/Dockerfile.multi) on GitHub. Switch to the branch that matches your `tensorrt-llm` version, and look for the `BASE_TAG` line to identify the recommended PyTorch container tag. > [!Important] diff --git a/container/build.sh b/container/build.sh index a095db4c9aa..cee547193c6 100755 --- a/container/build.sh +++ b/container/build.sh @@ -89,7 +89,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/" # TensorRT-LLM commit to use for building the trtllm wheel if not provided. # Important Note: This commit is not used in our CI pipeline. See the CI # variables to learn how to run a pipeline with a specific commit. -DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="e81c50dbd2811ec858eccc2c71b5e7a330ff7e24" +DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="0c9430e5a530ba958fc9dca561a3ad865ad9f492" TRTLLM_COMMIT="" TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0" TRTLLM_GIT_URL="" @@ -98,7 +98,7 @@ TRTLLM_GIT_URL="" TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package. # Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package. -DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.1.0rc3" +DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.1.0rc5" TENSORRTLLM_PIP_WHEEL="" diff --git a/docs/guides/run_kvbm_in_trtllm.md b/docs/guides/run_kvbm_in_trtllm.md index 760dfa71380..d5d846c9054 100644 --- a/docs/guides/run_kvbm_in_trtllm.md +++ b/docs/guides/run_kvbm_in_trtllm.md @@ -27,7 +27,7 @@ To learn what KVBM is, please check [here](https://docs.nvidia.com/dynamo/latest > - KVBM only supports TensorRT-LLM’s PyTorch backend. > - To enable disk cache offloading, you must first enable a CPU memory cache offloading. > - Disable partial reuse `enable_partial_reuse: false` in the LLM API config’s `kv_connector_config` to increase offloading cache hits. -> - KVBM requires TensorRT-LLM at commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 or newer. +> - KVBM requires TensorRT-LLM v1.1.0rc5 or newer. > - Enabling KVBM metrics with TensorRT-LLM is still a work in progress. ## Quick Start @@ -38,12 +38,8 @@ To use KVBM in TensorRT-LLM, you can follow the steps below: # start up etcd for KVBM leader/worker registration and discovery docker compose -f deploy/docker-compose.yml up -d -# Build a container that includes TensorRT-LLM and KVBM. Note: KVBM integration is only available in TensorRT-LLM commit dcd110cfac07e577ce01343c455917832b0f3d5e or newer. -# When building with the --tensorrtllm-commit option, you may notice that https://github.com keeps prompting for a username and password. -# This happens because cloning TensorRT-LLM can hit GitHub’s rate limit. -# To work around this, you can keep pressing "Enter" or "Return.". -# Setting "export GIT_LFS_SKIP_SMUDGE=1" may also reduce the number of prompts. -./container/build.sh --framework trtllm --tensorrtllm-commit dcd110cfac07e577ce01343c455917832b0f3d5e --enable-kvbm +# Build a container that includes TensorRT-LLM and KVBM. +./container/build.sh --framework trtllm --enable-kvbm # launch the container ./container/run.sh --framework trtllm -it --mount-workspace --use-nixl-gds diff --git a/docs/support_matrix.md b/docs/support_matrix.md index c6dc81858ef..7fcca6bf5bd 100644 --- a/docs/support_matrix.md +++ b/docs/support_matrix.md @@ -67,7 +67,7 @@ If you are using a **GPU**, the following GPU models and architectures are suppo | **Build Dependency** | **Version** | | :------------------- | :------------------------------------------------------------------------------- | | **Base Container** | [25.03](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda-dl-base/tags) | -| **TensorRT-LLM** | 1.1.0rc3 | +| **TensorRT-LLM** | 1.1.0rc5 | | **NIXL** | 0.4.1 | | **vLLM** | 0.10.1.1 | | **SGLang** | 0.5.0rc2 | diff --git a/lib/bindings/python/Cargo.lock b/lib/bindings/python/Cargo.lock index a2ab9a108fd..ba935069d41 100644 --- a/lib/bindings/python/Cargo.lock +++ b/lib/bindings/python/Cargo.lock @@ -1469,6 +1469,7 @@ dependencies = [ "rustpython-parser", "serde", "serde_json", + "tokio", "tracing", "uuid", ] diff --git a/pyproject.toml b/pyproject.toml index 90a89aa8d2f..eff8b4f4264 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git" [project.optional-dependencies] trtllm =[ "uvloop", - "tensorrt-llm==1.1.0rc3", + "tensorrt-llm==1.1.0rc5", ] vllm = [