ai-dynamo · richardhuo-nv · Aug 30, 2025 · Aug 14, 2025 · Aug 22, 2025 · Aug 23, 2025
@@ -66,7 +66,7 @@ jobs:
           docker run -v ${{ github.workspace }}:/workspace -w /workspace \
             --name ${{ env.CONTAINER_ID }}_pytest \
             ${{ steps.define_image_tag.outputs.image_tag }} \
-            bash -c "pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\""
+            bash -c "pytest --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ env.PYTEST_MARKS }}\" --ignore /workspace/lib/bindings/python/src/dynamo/llm/trtllm_integration/connector "
       - name: Copy test report from test Container
         if: always()
         run: |
@@ -89,4 +89,4 @@ jobs:
         uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
         with:
           name: Event File
-          path: ${{ github.event_path }}
+          path: ${{ github.event_path }}
@@ -15,6 +15,11 @@ jobs:
     steps:
       - name: Check out repository
         uses: actions/checkout@v4
+        with:
+          # For pull_request events, use the PR head (commit from the contributor's branch/repo)
+          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}
+          fetch-depth: 0
 
       # Cache lychee results (e.g. to avoid hitting rate limits)
       # https://lychee.cli.rs/github_action_recipes/caching/

diff --git a/docs/guides/run_kvbm_in_trtllm.md b/docs/guides/run_kvbm_in_trtllm.md
@@ -0,0 +1,121 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Running KVBM in TensorRT-LLM
+
+This guide explains how to leverage KVBM (KV Block Manager) to mange KV cache and do KV offloading in TensorRT-LLM (trtllm).
+
+To learn what KVBM is, please check [here](https://docs.nvidia.com/dynamo/latest/architecture/kvbm_intro.html)
+
+> [!Note]
+> - Ensure that `etcd` and 'nats' are running before starting.
+> - KVBM does not currently support CUDA graphs in TensorRT-LLM.
+> - KVBM only supports TensorRT-LLM’s PyTorch backend.
+> - KVBM requires TensorRT-LLM at commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 or newer.
+
+## Quick Start
+
+To use KVBM in TensorRT-LLM, you can follow the steps below:
+
+```bash
+# start up etcd for KVBM leader/worker registration and discovery
+docker compose -f deploy/docker-compose.yml up -d
+
+# Build a container that includes TensorRT-LLM and KVBM. Note: KVBM integration is only available in TensorRT-LLM commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 or newer.
+./container/build.sh --framework trtllm --tensorrtllm-commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 --enable-kvbm
+
+# launch the container
+./container/run.sh --framework trtllm -it --mount-workspace --use-nixl-gds
+
+# enable kv offloading to CPU memory
+# 60 means 60GB of pinned CPU memory would be used
+export DYN_KVBM_CPU_CACHE_GB=60
+
+# enable kv offloading to disk
+# 20 means 20GB of disk would be used
+export DYN_KVBM_DISK_CACHE_GB=20
+
+# Allocating memory and disk storage can take some time.
+# We recommend setting a higher timeout for leader–worker initialization.
+# 1200 means 1200 seconds timeout
+export DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS=1200
+```
+
+```bash
+# write an example LLM API config
+cat > "/tmp/kvbm_llm_api_config.yaml" <<EOF
+backend: pytorch
+cuda_graph_config: null
+kv_cache_config:
+  enable_partial_reuse: false
+  free_gpu_memory_fraction: 0.80
+kv_connector_config:
+  connector_module: dynamo.llm.trtllm_integration.connector
+  connector_scheduler_class: DynamoKVBMConnectorLeader
+  connector_worker_class: DynamoKVBMConnectorWorker
+EOF
+
+# serve an example LLM model
+trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
+
+# make a call to LLM
+curl localhost:8000/v1/chat/completions   -H "Content-Type: application/json"   -d '{
+    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "messages": [
+    {
+        "role": "user",
+        "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
+    }
+    ],
+    "stream":false,
+    "max_tokens": 30
+  }'
+```
+
+## Enable and View KVBM Metrics
+
+Follow below steps to enable metrics collection and view via Grafana dashboard:
+```bash
+# Start the basic services (etcd & natsd), along with Prometheus and Grafana
+docker compose -f deploy/docker-compose.yml --profile metrics up -d
+
+# write an example LLM API config
+cat > "/tmp/kvbm_llm_api_config.yaml" <<EOF
+backend: pytorch
+cuda_graph_config: null
+kv_cache_config:
+  enable_partial_reuse: false
+  free_gpu_memory_fraction: 0.80
+kv_connector_config:
+  connector_module: dynamo.llm.trtllm_integration.connector
+  connector_scheduler_class: DynamoKVBMConnectorLeader
+  connector_worker_class: DynamoKVBMConnectorWorker
+EOF
+
+# serve an example LLM model
+trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
+
+# start trtllm-serve with DYN_SYSTEM_ENABLED set to true and DYN_SYSTEM_PORT set to 6880
+# NOTE: Ensure ports 6880 (KVBM worker metrics) and 6881 (KVBM leader metrics) are available.
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=6880 trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
+
+# optional if firewall blocks KVBM metrics ports to send prometheus metrics
+sudo ufw allow 6880/tcp
+sudo ufw allow 6881/tcp
+```
+
+View grafana metrics via http://localhost:3001 (default login: dynamo/dynamo) and look for KVBM Dashboard
@@ -14,6 +14,7 @@
 // limitations under the License.
 
 use super::*;
+use anyhow::Result;
 use dynamo_llm::block_manager::block::{
     data::logical::distributed_leader_worker::DistributedLeaderWorkerResources, locality::Logical,
 };
@@ -220,3 +221,113 @@ impl BlockManager {
         &self.inner
     }
 }
+
+#[derive(Default)]
+pub struct BlockManagerBuilder {
+    worker_id: u64,
+    leader: Option<distributed::KvbmLeader>,
+    page_size: usize,
+    disable_device_pool: bool,
+}
+
+impl BlockManagerBuilder {
+    pub fn new() -> Self {
+        Self {
+            page_size: 32, // default consistent with BlockManager::new
+            ..Default::default()
+        }
+    }
+
+    pub fn worker_id(mut self, id: u64) -> Self {
+        self.worker_id = id;
+        self
+    }
+    pub fn page_size(mut self, ps: usize) -> Self {
+        self.page_size = ps;
+        self
+    }
+    pub fn leader(mut self, l: distributed::KvbmLeader) -> Self {
+        self.leader = Some(l);
+        self
+    }
+    pub fn disable_device_pool(mut self, yes: bool) -> Self {
+        self.disable_device_pool = yes;
+        self
+    }
+
+    /// Async build (call from an async context).
+    pub async fn build(self) -> Result<BlockManager> {
+        let worker_id = self.worker_id;
+        let leader = self.leader.ok_or_else(|| {
+            anyhow::anyhow!("leader is required (runtime is always taken from leader)")
+        })?;
+
+        // Get (inner leader handle, runtime) from the provided leader.
+        let (leader_inner, drt) = leader.dissolve();
+
+        let cancel_token = CancellationToken::new();
+
+        // Runtime & model config
+        let runtime_config = dynamo_llm::block_manager::KvManagerRuntimeConfig::builder()
+            .worker_id(worker_id)
+            .cancellation_token(cancel_token.clone())
+            .build()?;
+
+        let mut config =
+            dynamo_llm::block_manager::KvBlockManagerConfig::builder().runtime(runtime_config);
+
+        let model_config = dynamo_llm::block_manager::KvManagerModelConfig::builder()
+            .num_layers(1)
+            .outer_dim(1)
+            .page_size(self.page_size)
+            .inner_dim(1)
+            .build()?;
+
+        config = config.model(model_config);
+
+        // Layouts derived from leader’s counts
+        if !self.disable_device_pool {
+            config = config.device_layout(
+                dynamo_llm::block_manager::KvManagerLayoutConfig::builder()
+                    .num_blocks(leader_inner.num_device_blocks())
+                    .logical(Some(BlockParallelismStrategy::LeaderWorkerSharded))
+                    .build()?,
+            );
+        }
+
+        if leader_inner.num_host_blocks() > 0 {
+            config = config.host_layout(
+                dynamo_llm::block_manager::KvManagerLayoutConfig::builder()
+                    .num_blocks(leader_inner.num_host_blocks())
+                    .logical(Some(BlockParallelismStrategy::LeaderWorkerSharded))
+                    .build()?,
+            );
+        }
+
+        if leader_inner.num_disk_blocks() > 0 {
+            config = config.disk_layout(
+                dynamo_llm::block_manager::KvManagerLayoutConfig::builder()
+                    .num_blocks(leader_inner.num_disk_blocks())
+                    .logical(Some(BlockParallelismStrategy::LeaderWorkerSharded))
+                    .build()?,
+            );
+        }
+
+        let config = config.build()?;
+
+        let resources =
+            DistributedLeaderWorkerResources::new(Some(leader_inner), cancel_token.child_token())?;
+
+        let inner = dynamo_llm::block_manager::KvBlockManager::<
+            Logical<DistributedLeaderWorkerResources>,
+            BasicMetadata,
+        >::new(config, resources)
+        .await?;
+
+        Ok(BlockManager {
+            inner,
+            drt,
+            _controller: None,
+        })
+    }
+}
@@ -8,5 +8,5 @@ mod utils;
 mod worker;
 
 pub use leader::KvbmLeader;
-pub use utils::get_barrier_id;
+pub use utils::get_barrier_id_prefix;
 pub use worker::{KvbmWorker, VllmTensor};
@@ -2,10 +2,12 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use super::*;
-use utils::get_barrier_id;
+use utils::get_barrier_id_prefix;
 
 use derive_getters::Dissolve;
-use llm_rs::block_manager::distributed::{KvbmLeader as KvbmLeaderImpl, KvbmLeaderConfig};
+use llm_rs::block_manager::distributed::{
+    KvbmLeader as KvbmLeaderImpl, KvbmLeaderConfig, KvbmLeaderNumBlocksConfig,
+};
 
 const CPU_CACHE: &str = "DYN_KVBM_CPU_CACHE_GB";
 const CPU_CACHE_OVERRIDE: &str = "DYN_KVBM_CPU_CACHE_OVERRIDE_NUM_BLOCKS";
@@ -16,15 +18,32 @@ const DISK_CACHE_OVERRIDE: &str = "DYN_KVBM_DISK_CACHE_OVERRIDE_NUM_BLOCKS";
 const LEADER_WORKER_INIT_TIMEOUT_SECS: &str = "DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS";
 const DEFAULT_INIT_TIMEOUT_SECS: u64 = 120;
 
-fn compute_num_blocks(cache_size_key: &str, override_key: &str, bytes_per_block: usize) -> usize {
-    if let Ok(override_num_blocks) = std::env::var(override_key) {
-        override_num_blocks.parse::<usize>().unwrap_or(0)
-    } else {
-        let cache_size_gb = std::env::var(cache_size_key)
-            .unwrap_or_default()
-            .parse::<f64>()
-            .unwrap_or(0.0);
-        ((cache_size_gb * 1_000_000_000.0) / bytes_per_block as f64) as usize
+fn read_env_usize(key: &str) -> Option<usize> {
+    std::env::var(key).ok()?.trim().parse::<usize>().ok()
+}
+
+fn read_cache_size_float(key: &str) -> f64 {
+    std::env::var(key)
+        .unwrap_or_default()
+        .parse::<f64>()
+        .unwrap_or(0.0)
+}
+
+fn get_blocks_config(cache_size_key: &str, override_key: &str) -> KvbmLeaderNumBlocksConfig {
+    if let Some(nblocks) = read_env_usize(override_key) {
+        // Optional: still read cache size for observability, but override takes precedence.
+        let cache_gb: f64 = read_cache_size_float(cache_size_key);
+        return KvbmLeaderNumBlocksConfig {
+            cache_size_in_gb: cache_gb,
+            num_blocks_overriden: nblocks,
+        };
+    }
+
+    // No override -> compute from cache size (in GB)
+    let cache_gb: f64 = read_cache_size_float(cache_size_key);
+    KvbmLeaderNumBlocksConfig {
+        cache_size_in_gb: cache_gb,
+        num_blocks_overriden: 0,
     }
 }
 
@@ -51,22 +70,19 @@ impl KvbmLeader {
 #[pymethods]
 impl KvbmLeader {
     #[new]
-    #[pyo3(signature = (bytes_per_block, world_size, drt))]
-    fn new(bytes_per_block: usize, world_size: usize, drt: DistributedRuntime) -> PyResult<Self> {
-        let num_host_blocks = compute_num_blocks(CPU_CACHE, CPU_CACHE_OVERRIDE, bytes_per_block);
-        let num_disk_blocks = compute_num_blocks(DISK_CACHE, DISK_CACHE_OVERRIDE, bytes_per_block);
-
-        let barrier_id = get_barrier_id();
+    #[pyo3(signature = (world_size, drt))]
+    fn new(world_size: usize, drt: DistributedRuntime) -> PyResult<Self> {
+        let barrier_id_prefix = get_barrier_id_prefix();
         let leader_init_timeout_sec: u64 =
             get_leader_init_timeout_secs(LEADER_WORKER_INIT_TIMEOUT_SECS);
 
         let config = KvbmLeaderConfig::builder()
-            .barrier_id(barrier_id)
-            .num_host_blocks(num_host_blocks)
-            .num_disk_blocks(num_disk_blocks)
+            .barrier_id_prefix(barrier_id_prefix)
             .world_size(world_size)
             .leader_init_timeout_secs(leader_init_timeout_sec)
             .drt(drt.inner().clone())
+            .host_blocks_config(get_blocks_config(CPU_CACHE, CPU_CACHE_OVERRIDE))
+            .disk_blocks_config(get_blocks_config(DISK_CACHE, DISK_CACHE_OVERRIDE))
             .build()
             .map_err(to_pyerr)?;
 

@@ -1,6 +1,9 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-pub fn get_barrier_id() -> String {
-    std::env::var("DYN_KVBM_BARRIER_ID").unwrap_or("kvbm".to_string())
+pub fn get_barrier_id_prefix() -> String {
+    std::env::var("DYN_KVBM_BARRIER_ID_PREFIX")
+        .ok()
+        .filter(|s| !s.trim().is_empty())
+        .unwrap_or_else(|| "kvbm".to_string())
 }