fix: devcontainer.json typo from b6b3a76 (#2976)

Signed-off-by: Keiven Chang <[email protected]> Signed-off-by: Chi McIsaac <[email protected]>
ai-dynamo · PeaBrane · Sep 18, 2025 · Sep 10, 2025 · Sep 10, 2025 · Sep 10, 2025
commit 6f8e1f47d2b33e8731a0f7aa20f8c76871a31845
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -7,7 +7,7 @@
     "name": "NVIDIA Dynamo Dev Container Development",
     "remoteUser": "ubuntu", // Matches our container user
     "updateRemoteUserUID": true, // Updates the UID of the remote user to match the host user, avoids permission errors
-    "image": "dynamo:latest-vllm-dev", // Use the latest VLLM dev image
+    "image": "dynamo:latest-vllm-local-dev", // Use the latest VLLM dev image
     "runArgs": [
         "--gpus=all",
         "--network=host",

diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md
@@ -215,6 +215,14 @@ python3 -m dynamo.sglang ... --migration-limit=3
 
 This allows a request to be migrated up to 3 times before failing. See the [Request Migration Architecture](../../../docs/architecture/request_migration.md) documentation for details on how this works.
 
+## Hashing Consistency for KV Events
+
+When emitting KV events for KV-aware routing, ensure deterministic hashing across processes to prevent radix tree mismatches in the router:
+
+- Set `PYTHONHASHSEED=0` for all SGLang processes.
+- Ensure the block IDs published in events are deterministic across ranks and restarts.
+- See [KV Events & Hashing](../../../docs/guides/kv_events_hashing.md) for background and a reference test vector check to validate your environment.
+
 ## Advanced Examples
 
 Below we provide a selected list of advanced examples. Please open up an issue if you'd like to see a specific example!
@@ -237,4 +245,4 @@ We currently provide deployment examples for Kubernetes and SLURM.
 - **[Deploying Dynamo with SGLang on Kubernetes](deploy/README.md)**
 
 ## SLURM
-- **[Deploying Dynamo with SGLang on SLURM](slurm_jobs/README.md)**
+- **[Deploying Dynamo with SGLang on SLURM](slurm_jobs/README.md)**
diff --git a/components/backends/trtllm/README.md b/components/backends/trtllm/README.md
@@ -199,6 +199,14 @@ NOTE: To send a request to a multi-node deployment, target the node which is run
 To benchmark your deployment with GenAI-Perf, see this utility script, configuring the
 `model` name and `host` based on your deployment: [perf.sh](../../../benchmarks/llm/perf.sh)
 
+## Hashing Consistency for KV Events
+
+When using KV-aware routing with TensorRT-LLM, ensure deterministic event identifiers across processes and runs so the router can correctly apply parent links and removals:
+
+- Set a stable `--random-seed` where applicable.
+- Ensure the block IDs used in KV events are deterministic across ranks and restarts.
+- See [KV Events & Hashing](../../../docs/guides/kv_events_hashing.md) for details and a reference test vector check used by the router.
+
 
 ## Disaggregation Strategy
 

diff --git a/components/backends/vllm/README.md b/components/backends/vllm/README.md
@@ -168,6 +168,19 @@ See `args.py` for the full list of configuration options and their defaults.
 
 The [documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html?h=serve+arg) for the vLLM CLI args points to running 'vllm serve --help' to see what CLI args can be added. We use the same argument parser as vLLM.
 
+### Hashing Consistency for KV Events
+
+When using KV-aware routing, ensure deterministic hashing across processes to avoid radix tree mismatches:
+
+- Set `PYTHONHASHSEED=0` for all vLLM processes.
+- If your vLLM version supports it, configure a deterministic prefix caching algorithm, for example:
+
+```bash
+vllm serve ... --enable-prefix-caching --prefix-caching-hash-algo sha256
+```
+
+The router uses a canonical xxh3-64 hash (seed=1337) for matching local blocks. See [KV Events & Hashing](../../../docs/guides/kv_events_hashing.md) for details and a reference test vector check.
+
 ## Request Migration
 
 You can enable [request migration](../../../docs/architecture/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker:

diff --git a/components/backends/vllm/src/dynamo/vllm/args.py b/components/backends/vllm/src/dynamo/vllm/args.py
@@ -133,6 +133,18 @@ def parse_args() -> Config:
         )
         engine_args.enable_prefix_caching = True
 
+    # Recommend deterministic hashing setup when prefix caching is enabled
+    if engine_args.enable_prefix_caching:
+        phs = os.environ.get("PYTHONHASHSEED")
+        if not phs or phs == "random":
+            logger.warning(
+                "PYTHONHASHSEED is unset or random. For deterministic KV block IDs across processes, set PYTHONHASHSEED=0."
+            )
+        # Best-effort guidance on vLLM hashing mode
+        logger.info(
+            "If your vLLM version supports it, consider setting --prefix-caching-algo sha256 to use a deterministic prefix hashing implementation."
+        )
+
     config = Config()
     config.model = args.model
     if args.served_model_name:

diff --git a/docs/guides/kv_events_hashing.md b/docs/guides/kv_events_hashing.md
@@ -0,0 +1,66 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+SPDX-License-Identifier: Apache-2.0
+-->
+
+# KV Events & Hashing Consistency
+
+This guide explains how Dynamo computes and consumes KV cache block hashes, and how to ensure consistent hashing across engines, processes, and nodes.
+
+## Canonical Hashing (Router)
+
+- Algorithm: xxh3-64
+- Seed: 1337
+- Token encoding: u32 tokens serialized via little-endian `to_le_bytes`
+- Scope: Computes "local block hashes" used by the router/indexer to match cached prefixes.
+
+Reference implementations:
+- Rust (primary): `lib/llm/src/kv_router/indexer.rs` (`compute_block_hash_for_seq`)
+- Python binding: `dynamo._core.compute_block_hash_for_seq_py` (delegates to the Rust implementation)
+
+Note:
+- `kv_block_size` must be identical between the engine that publishes KV events and the router. A mismatch will yield different local block hashes and break prefix matching.
+
+Reference test vector check:
+- Tokens `[1,2,3,4]`, `kv_block_size=4` → `14643705804678351452`
+
+## Engine Block IDs vs Router Hashes
+
+- LocalBlockHash (router): Canonical value used for KV matching.
+- ExternalSequenceBlockHash (engine): Engine-provided block identifiers to link parent/child and removals; MUST be deterministic within a deployment.
+
+The router recomputes LocalBlockHash from tokens on ingest. If parent links or removals reference unknown ExternalSequenceBlockHash, the router logs a warning (or error if `DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1`).
+
+## Engine Configuration Tips
+
+The goal is to ensure that emitted KV events are deterministic across ranks/restarts.
+
+General:
+- Set `PYTHONHASHSEED=0` for Python processes to eliminate hash randomization.
+
+vLLM:
+- If your version supports it, set a deterministic prefix-caching algorithm, e.g. `--prefix-caching-hash-algo sha256`.
+- Keep `enable_prefix_caching=True` when emitting KV events.
+
+SGLang:
+- Ensure events use deterministic block IDs across processes. If applicable, set `PYTHONHASHSEED=0`.
+
+TensorRT-LLM:
+- Use a stable `--random-seed` where applicable and validate that KV event block IDs are deterministic across launches.
+
+## Observability and Enforcement
+
+- When a parent link is missing or a removal refers to an unknown block ID, the router logs a warning that includes remediation hints.
+- Set `DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1` (or `true`, case-insensitive) to promote these warnings to error-level logs. This does not abort processing; the router still skips the offending operation.
+
+## Quick Self-Check
+
+From Python:
+
+```python
+from dynamo._core import compute_block_hash_for_seq_py
+assert compute_block_hash_for_seq_py([1,2,3,4], 4)[0] == 14643705804678351452
+```
+
+If this check fails on any node, verify that the node runs the same Dynamo build as the router: the check only exercises the router's canonical hashing path (known-answer test).
+It does not validate that engine-emitted block IDs are deterministic; for those, verify the environment and engine flags described above.
diff --git a/docs/index.rst b/docs/index.rst
@@ -50,12 +50,12 @@ Quickstart
    :hidden:
    :caption: Kubernetes Deployment
 
-   Quickstart (K8s) <../guides/dynamo_deploy/README.md>
-   Dynamo Operator <../guides/dynamo_deploy/dynamo_operator.md>
-   Metrics <../guides/dynamo_deploy/metrics.md>
-   Logging <../guides/dynamo_deploy/logging.md>
-   Multinode <../guides/dynamo_deploy/multinode-deployment.md>
-   Minikube Setup <../guides/dynamo_deploy/minikube.md>
+   Quickstart (K8s) <guides/dynamo_deploy/README.md>
+   Dynamo Operator <guides/dynamo_deploy/dynamo_operator.md>
+   Metrics <guides/dynamo_deploy/metrics.md>
+   Logging <guides/dynamo_deploy/logging.md>
+   Multinode <guides/dynamo_deploy/multinode-deployment.md>
+   Minikube Setup <guides/dynamo_deploy/minikube.md>
 
 .. toctree::
    :hidden:
@@ -65,6 +65,7 @@ Quickstart
    Router <components/router/README>
    Planner <architecture/planner_intro>
    KVBM <architecture/kvbm_intro>
+   KV Events & Hashing <guides/kv_events_hashing.md>
 
 .. toctree::
    :hidden:

diff --git a/lib/bindings/python/tests/test_kv_bindings.py b/lib/bindings/python/tests/test_kv_bindings.py
@@ -21,6 +21,7 @@
 
 from dynamo.llm import (
     ApproxKvIndexer,
+    compute_block_hash_for_seq_py,
     ForwardPassMetrics,
     KvEventPublisher,
     KvIndexer,
@@ -282,3 +283,11 @@ async def metrics_publisher_task(kv_listener, expected_metrics):
     # test can discover them.
     metrics_publisher.publish(metrics)
     await metrics_publisher.create_endpoint(kv_listener)
+
+
+def test_block_hash_ref_vector():
+    # Known-answer check: tokens [1,2,3,4] with kv_block_size=4 must yield one known hash value
+    tokens = [1, 2, 3, 4]
+    out = compute_block_hash_for_seq_py(tokens, 4)
+    assert isinstance(out, list) and len(out) == 1
+    assert out[0] == 14643705804678351452
diff --git a/lib/llm/src/kv_router/indexer.rs b/lib/llm/src/kv_router/indexer.rs
@@ -335,12 +335,24 @@ impl RadixTree {
                 let mut current = match current {
                     Some(current) => current.clone(),
                     None => {
-                        tracing::warn!(
-                            worker_id = worker_id.to_string(),
-                            id,
-                            parent_hash = ?op.parent_hash,
-                            "Failed to find parent block; skipping store operation"
-                        );
+                        let enforce = std::env::var("DYN_KV_ENFORCE_ENGINE_HASH_STABILITY")
+                            .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
+                            .unwrap_or(false);
+                        if enforce {
+                            tracing::error!(
+                                worker_id,
+                                id,
+                                parent_hash = ?op.parent_hash,
+                                "Missing parent block; skipping store. Likely inconsistent hashing across processes. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).",
+                            );
+                        } else {
+                            tracing::warn!(
+                                worker_id,
+                                id,
+                                parent_hash = ?op.parent_hash,
+                                "Missing parent block; skipping store. Set DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1 to log as error. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).",
+                            );
+                        }
                         return;
                     }
                 };
@@ -389,11 +401,22 @@ impl RadixTree {
                     let entry = match worker_lookup.get(&block) {
                         Some(entry) => entry.clone(),
                         None => {
-                            tracing::warn!(
-                                worker_id = worker_id.to_string(),
-                                id,
-                                "Failed to find block to remove; skipping remove operation"
-                            );
+                            let enforce = std::env::var("DYN_KV_ENFORCE_ENGINE_HASH_STABILITY")
+                                .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
+                                .unwrap_or(false);
+                            if enforce {
+                                tracing::error!(
+                                    worker_id,
+                                    id,
+                                    "Missing block to remove; skipping removal. Likely inconsistent hashing across processes. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).",
+                                );
+                            } else {
+                                tracing::warn!(
+                                    worker_id,
+                                    id,
+                                    "Missing block to remove; skipping removal. Set DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1 to log as error. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).",
+                                );
+                            }
                             continue;
                         }
                     };
@@ -1163,6 +1186,16 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_block_hash_ref_vector() {
+        // Known-answer test: tokens [1,2,3,4] with kv_block_size=4 must yield exactly one
+        // block hash, equal to the known xxh3-64(seed=1337) reference value asserted below.
+        let tokens: Vec<u32> = vec![1, 2, 3, 4];
+        let hashes = compute_block_hash_for_seq(&tokens, 4);
+        assert_eq!(hashes.len(), 1);
+        assert_eq!(hashes[0].0, 14643705804678351452u64);
+    }
+
     fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec<u64>) -> RouterEvent {
         RouterEvent {
             worker_id,