From 6f8e1f47d2b33e8731a0f7aa20f8c76871a31845 Mon Sep 17 00:00:00 2001 From: Chi McIsaac Date: Wed, 10 Sep 2025 01:30:05 -0400 Subject: [PATCH 01/11] fix: devcontainer.json typo from b6b3a767c (#2976) Signed-off-by: Keiven Chang Signed-off-by: Chi McIsaac --- .devcontainer/devcontainer.json | 2 +- components/backends/sglang/README.md | 10 ++- components/backends/trtllm/README.md | 8 +++ components/backends/vllm/README.md | 13 ++++ .../backends/vllm/src/dynamo/vllm/args.py | 12 ++++ docs/guides/kv_events_hashing.md | 66 +++++++++++++++++++ docs/index.rst | 13 ++-- lib/bindings/python/tests/test_kv_bindings.py | 9 +++ lib/llm/src/kv_router/indexer.rs | 55 ++++++++++++---- 9 files changed, 169 insertions(+), 19 deletions(-) create mode 100644 docs/guides/kv_events_hashing.md diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 56f00231287..14d1c331ce4 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -7,7 +7,7 @@ "name": "NVIDIA Dynamo Dev Container Development", "remoteUser": "ubuntu", // Matches our container user "updateRemoteUserUID": true, // Updates the UID of the remote user to match the host user, avoids permission errors - "image": "dynamo:latest-vllm-dev", // Use the latest VLLM dev image + "image": "dynamo:latest-vllm-local-dev", // Use the latest VLLM dev image "runArgs": [ "--gpus=all", "--network=host", diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md index f1e36d553d6..4bd03ba993d 100644 --- a/components/backends/sglang/README.md +++ b/components/backends/sglang/README.md @@ -215,6 +215,14 @@ python3 -m dynamo.sglang ... --migration-limit=3 This allows a request to be migrated up to 3 times before failing. See the [Request Migration Architecture](../../../docs/architecture/request_migration.md) documentation for details on how this works. 
+## Hashing Consistency for KV Events + +When emitting KV events for KV-aware routing, ensure deterministic hashing across processes to prevent radix tree mismatches in the router: + +- Set `PYTHONHASHSEED=0` for all SGLang processes. +- Ensure the block IDs published in events are deterministic across ranks and restarts. +- See [KV Events & Hashing](../../../docs/guides/kv_events_hashing.md) for background and a reference test vector check to validate your environment. + ## Advanced Examples Below we provide a selected list of advanced examples. Please open up an issue if you'd like to see a specific example! @@ -237,4 +245,4 @@ We currently provide deployment examples for Kubernetes and SLURM. - **[Deploying Dynamo with SGLang on Kubernetes](deploy/README.md)** ## SLURM -- **[Deploying Dynamo with SGLang on SLURM](slurm_jobs/README.md)** \ No newline at end of file +- **[Deploying Dynamo with SGLang on SLURM](slurm_jobs/README.md)** diff --git a/components/backends/trtllm/README.md b/components/backends/trtllm/README.md index 01c1e69f6bd..1a02aa2639d 100644 --- a/components/backends/trtllm/README.md +++ b/components/backends/trtllm/README.md @@ -199,6 +199,14 @@ NOTE: To send a request to a multi-node deployment, target the node which is run To benchmark your deployment with GenAI-Perf, see this utility script, configuring the `model` name and `host` based on your deployment: [perf.sh](../../../benchmarks/llm/perf.sh) +## Hashing Consistency for KV Events + +When using KV-aware routing with TensorRT-LLM, ensure deterministic event identifiers across processes and runs so the router can correctly apply parent links and removals: + +- Set a stable `--random-seed` where applicable. +- Ensure the block IDs used in KV events are deterministic across ranks and restarts. +- See [KV Events & Hashing](../../../docs/guides/kv_events_hashing.md) for details and a reference test vector check used by the router. 
+ ## Disaggregation Strategy diff --git a/components/backends/vllm/README.md b/components/backends/vllm/README.md index dd372b1197a..4ebcdfe7539 100644 --- a/components/backends/vllm/README.md +++ b/components/backends/vllm/README.md @@ -168,6 +168,19 @@ See `args.py` for the full list of configuration options and their defaults. The [documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html?h=serve+arg) for the vLLM CLI args points to running 'vllm serve --help' to see what CLI args can be added. We use the same argument parser as vLLM. +### Hashing Consistency for KV Events + +When using KV-aware routing, ensure deterministic hashing across processes to avoid radix tree mismatches: + +- Set `PYTHONHASHSEED=0` for all vLLM processes. +- If your vLLM version supports it, configure a deterministic prefix caching algorithm, for example: + +```bash +vllm serve ... --enable-prefix-caching --prefix-caching-algo sha256 +``` + +The router uses a canonical xxh3-64 hash (seed=1337) for matching local blocks. See [KV Events & Hashing](../../../docs/guides/kv_events_hashing.md) for details and a reference test vector check. + ## Request Migration You can enable [request migration](../../../docs/architecture/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker: diff --git a/components/backends/vllm/src/dynamo/vllm/args.py b/components/backends/vllm/src/dynamo/vllm/args.py index f8b12d5340f..a131c6da556 100644 --- a/components/backends/vllm/src/dynamo/vllm/args.py +++ b/components/backends/vllm/src/dynamo/vllm/args.py @@ -133,6 +133,18 @@ def parse_args() -> Config: ) engine_args.enable_prefix_caching = True + # Recommend deterministic hashing setup when prefix caching is enabled + if engine_args.enable_prefix_caching: + phs = os.environ.get("PYTHONHASHSEED") + if not phs or phs == "random": + logger.warning( + "PYTHONHASHSEED is unset or random. 
For deterministic KV block IDs across processes, set PYTHONHASHSEED=0." + ) + # Best-effort guidance on vLLM hashing mode + logger.info( + "If your vLLM version supports it, consider setting --prefix-caching-algo sha256 to use a deterministic prefix hashing implementation." + ) + config = Config() config.model = args.model if args.served_model_name: diff --git a/docs/guides/kv_events_hashing.md b/docs/guides/kv_events_hashing.md new file mode 100644 index 00000000000..9b10085db93 --- /dev/null +++ b/docs/guides/kv_events_hashing.md @@ -0,0 +1,66 @@ + + +# KV Events & Hashing Consistency + +This guide explains how Dynamo computes and consumes KV cache block hashes, and how to ensure consistent hashing across engines, processes, and nodes. + +## Canonical Hashing (Router) + +- Algorithm: xxh3-64 +- Seed: 1337 +- Token encoding: u32 tokens serialized via little-endian `to_le_bytes` +- Scope: Computes "local block hashes" used by the router/indexer to match cached prefixes. + +Reference implementations: +- Rust (primary): `lib/llm/src/kv_router/indexer.rs` (`compute_block_hash_for_seq`) +- Python binding: `dynamo._core.compute_block_hash_for_seq_py` (delegates to the Rust implementation) + +Note: +- `kv_block_size` must be identical between the engine that publishes KV events and the router. A mismatch will yield different local block hashes and break prefix matching. + +Reference test vector check: +- Tokens `[1,2,3,4]`, `kv_block_size=4` → `14643705804678351452` + +## Engine Block IDs vs Router Hashes + +- LocalBlockHash (router): Canonical value used for KV matching. +- ExternalSequenceBlockHash (engine): Engine-provided block identifiers to link parent/child and removals; MUST be deterministic within a deployment. + +The router recomputes LocalBlockHash from tokens on ingest. If parent links or removals reference unknown ExternalSequenceBlockHash, the router logs a warning (or error if `DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1`). 
+ +## Engine Configuration Tips + +The goal is to ensure that emitted KV events are deterministic across ranks/restarts. + +General: +- Set `PYTHONHASHSEED=0` for Python processes to eliminate hash randomization. + +vLLM: +- If your version supports it, set a deterministic prefix-caching algorithm, e.g. `--prefix-caching-algo sha256`. +- Keep `enable_prefix_caching=True` when emitting KV events. + +SGLang: +- Ensure events use deterministic block IDs across processes. If applicable, set `PYTHONHASHSEED=0`. + +TensorRT-LLM: +- Use a stable `--random-seed` where applicable and validate that KV event block IDs are deterministic across launches. + +## Observability and Enforcement + +- Warnings on router when parent link is missing or a removal refers to an unknown block id include remediation hints. +- Set `DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1` to promote these warnings to error-level logs. This does not abort processing; the router still skips the offending operation. + +## Quick Self-Check + +From Python: + +```python +from dynamo._core import compute_block_hash_for_seq_py +assert compute_block_hash_for_seq_py([1,2,3,4], 4)[0] == 14643705804678351452 +``` + +If this check fails across nodes, verify environment and engine flags per above. +This self‑check only validates the router’s canonical hashing path (known‑answer test); it does not validate that engine‑emitted block IDs are deterministic. 
diff --git a/docs/index.rst b/docs/index.rst index a90795ce053..e177115214c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -50,12 +50,12 @@ Quickstart :hidden: :caption: Kubernetes Deployment - Quickstart (K8s) <../guides/dynamo_deploy/README.md> - Dynamo Operator <../guides/dynamo_deploy/dynamo_operator.md> - Metrics <../guides/dynamo_deploy/metrics.md> - Logging <../guides/dynamo_deploy/logging.md> - Multinode <../guides/dynamo_deploy/multinode-deployment.md> - Minikube Setup <../guides/dynamo_deploy/minikube.md> + Quickstart (K8s) + Dynamo Operator + Metrics + Logging + Multinode + Minikube Setup .. toctree:: :hidden: @@ -65,6 +65,7 @@ Quickstart Router Planner KVBM + KV Events & Hashing .. toctree:: :hidden: diff --git a/lib/bindings/python/tests/test_kv_bindings.py b/lib/bindings/python/tests/test_kv_bindings.py index 3324ad12798..0497190b663 100644 --- a/lib/bindings/python/tests/test_kv_bindings.py +++ b/lib/bindings/python/tests/test_kv_bindings.py @@ -21,6 +21,7 @@ from dynamo.llm import ( ApproxKvIndexer, + compute_block_hash_for_seq_py, ForwardPassMetrics, KvEventPublisher, KvIndexer, @@ -282,3 +283,11 @@ async def metrics_publisher_task(kv_listener, expected_metrics): # test can discover them. 
metrics_publisher.publish(metrics) await metrics_publisher.create_endpoint(kv_listener) + + +def test_block_hash_ref_vector(): + # Reference test vector check: tokens [1,2,3,4], kv_block_size=4 + tokens = [1, 2, 3, 4] + out = compute_block_hash_for_seq_py(tokens, 4) + assert isinstance(out, list) and len(out) == 1 + assert out[0] == 14643705804678351452 diff --git a/lib/llm/src/kv_router/indexer.rs b/lib/llm/src/kv_router/indexer.rs index c8264d90c6f..f66b60db4d2 100644 --- a/lib/llm/src/kv_router/indexer.rs +++ b/lib/llm/src/kv_router/indexer.rs @@ -335,12 +335,24 @@ impl RadixTree { let mut current = match current { Some(current) => current.clone(), None => { - tracing::warn!( - worker_id = worker_id.to_string(), - id, - parent_hash = ?op.parent_hash, - "Failed to find parent block; skipping store operation" - ); + let enforce = std::env::var("DYN_KV_ENFORCE_ENGINE_HASH_STABILITY") + .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) + .unwrap_or(false); + if enforce { + tracing::error!( + worker_id, + id, + parent_hash = ?op.parent_hash, + "Missing parent block; skipping store. Likely inconsistent hashing across processes. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).", + ); + } else { + tracing::warn!( + worker_id, + id, + parent_hash = ?op.parent_hash, + "Missing parent block; skipping store. Set DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1 to log as error. 
Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).", + ); + } return; } }; @@ -389,11 +401,22 @@ impl RadixTree { let entry = match worker_lookup.get(&block) { Some(entry) => entry.clone(), None => { - tracing::warn!( - worker_id = worker_id.to_string(), - id, - "Failed to find block to remove; skipping remove operation" - ); + let enforce = std::env::var("DYN_KV_ENFORCE_ENGINE_HASH_STABILITY") + .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) + .unwrap_or(false); + if enforce { + tracing::error!( + worker_id, + id, + "Missing block to remove; skipping removal. Likely inconsistent hashing across processes. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).", + ); + } else { + tracing::warn!( + worker_id, + id, + "Missing block to remove; skipping removal. Set DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1 to log as error. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).", + ); + } continue; } }; @@ -1163,6 +1186,16 @@ mod tests { } } + #[test] + fn test_block_hash_ref_vector() { + // Reference test vector check: tokens [1,2,3,4], kv_block_size=4 + // Should equal the known xxh3-64(seed=1337) value below. 
+ let tokens: Vec = vec![1, 2, 3, 4]; + let hashes = compute_block_hash_for_seq(&tokens, 4); + assert_eq!(hashes.len(), 1); + assert_eq!(hashes[0].0, 14643705804678351452u64); + } + fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec) -> RouterEvent { RouterEvent { worker_id, From 385040d9c4ed17df68390394e411aa1e432ff111 Mon Sep 17 00:00:00 2001 From: Chi McIsaac Date: Wed, 10 Sep 2025 01:44:39 -0400 Subject: [PATCH 02/11] fix import order Signed-off-by: Chi McIsaac --- lib/bindings/python/tests/test_kv_bindings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bindings/python/tests/test_kv_bindings.py b/lib/bindings/python/tests/test_kv_bindings.py index 0497190b663..54f48ffeb46 100644 --- a/lib/bindings/python/tests/test_kv_bindings.py +++ b/lib/bindings/python/tests/test_kv_bindings.py @@ -21,7 +21,6 @@ from dynamo.llm import ( ApproxKvIndexer, - compute_block_hash_for_seq_py, ForwardPassMetrics, KvEventPublisher, KvIndexer, @@ -30,6 +29,7 @@ RadixTree, WorkerMetricsPublisher, WorkerStats, + compute_block_hash_for_seq_py, ) from dynamo.runtime import Component, DistributedRuntime From c232989ebbc3701a0950d94bb0b03671498512ac Mon Sep 17 00:00:00 2001 From: Chi McIsaac Date: Wed, 10 Sep 2025 18:46:57 -0400 Subject: [PATCH 03/11] remove guide and uneccessary changes, added docstrings Signed-off-by: Chi McIsaac --- components/backends/sglang/README.md | 5 +- components/backends/trtllm/README.md | 9 +- components/backends/vllm/README.md | 7 +- .../backends/vllm/src/dynamo/vllm/args.py | 20 ++-- docs/architecture/kv_cache_routing.md | 4 + docs/guides/kv_events_hashing.md | 66 ------------ docs/index.rst | 1 - lib/bindings/python/tests/test_kv_bindings.py | 9 -- lib/llm/src/kv_router/indexer.rs | 100 +++++------------- 9 files changed, 50 insertions(+), 171 deletions(-) delete mode 100644 docs/guides/kv_events_hashing.md diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md index 
4bd03ba993d..105f33c188e 100644 --- a/components/backends/sglang/README.md +++ b/components/backends/sglang/README.md @@ -217,11 +217,10 @@ This allows a request to be migrated up to 3 times before failing. See the [Requ ## Hashing Consistency for KV Events -When emitting KV events for KV-aware routing, ensure deterministic hashing across processes to prevent radix tree mismatches in the router: +When emitting KV events for KV-aware routing, ensure event identifiers are deterministic across processes and restarts so the router can correctly apply parent links and removals: -- Set `PYTHONHASHSEED=0` for all SGLang processes. - Ensure the block IDs published in events are deterministic across ranks and restarts. -- See [KV Events & Hashing](../../../docs/guides/kv_events_hashing.md) for background and a reference test vector check to validate your environment. +- See the high-level notes in [KV Cache Routing](../../../docs/architecture/kv_cache_routing.md) on deterministic event IDs. ## Advanced Examples diff --git a/components/backends/trtllm/README.md b/components/backends/trtllm/README.md index 1a02aa2639d..0c2e9129689 100644 --- a/components/backends/trtllm/README.md +++ b/components/backends/trtllm/README.md @@ -201,11 +201,12 @@ To benchmark your deployment with GenAI-Perf, see this utility script, configuri ## Hashing Consistency for KV Events -When using KV-aware routing with TensorRT-LLM, ensure deterministic event identifiers across processes and runs so the router can correctly apply parent links and removals: +When using KV-aware routing with TensorRT-LLM, ensure event identifiers are deterministic across processes and runs so the router can correctly apply parent links and removals: -- Set a stable `--random-seed` where applicable. -- Ensure the block IDs used in KV events are deterministic across ranks and restarts. -- See [KV Events & Hashing](../../../docs/guides/kv_events_hashing.md) for details and a reference test vector check used by the router. 
+- Ensure all workers run the same TRT-LLM version/build and engine configuration so block ID computation is consistent. +- Validate that the KV event block IDs are identical across ranks/restarts for the same inputs. +- Client-side sampling seeds (e.g., in benchmarking tools) do not affect KV block IDs. +- See the high-level notes in [KV Cache Routing](../../../docs/architecture/kv_cache_routing.md) on deterministic event IDs. ## Disaggregation Strategy diff --git a/components/backends/vllm/README.md b/components/backends/vllm/README.md index 4ebcdfe7539..619ad475601 100644 --- a/components/backends/vllm/README.md +++ b/components/backends/vllm/README.md @@ -170,16 +170,15 @@ The [documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html ### Hashing Consistency for KV Events -When using KV-aware routing, ensure deterministic hashing across processes to avoid radix tree mismatches: +When using KV-aware routing, ensure deterministic hashing across processes to avoid radix tree mismatches. Choose one of the following: -- Set `PYTHONHASHSEED=0` for all vLLM processes. +- Set `PYTHONHASHSEED=0` for all vLLM processes when relying on Python's builtin hashing for prefix caching. - If your vLLM version supports it, configure a deterministic prefix caching algorithm, for example: ```bash vllm serve ... --enable-prefix-caching --prefix-caching-algo sha256 ``` - -The router uses a canonical xxh3-64 hash (seed=1337) for matching local blocks. See [KV Events & Hashing](../../../docs/guides/kv_events_hashing.md) for details and a reference test vector check. +See the high-level notes in [KV Cache Routing](../../../docs/architecture/kv_cache_routing.md) on deterministic event IDs. 
## Request Migration diff --git a/components/backends/vllm/src/dynamo/vllm/args.py b/components/backends/vllm/src/dynamo/vllm/args.py index a131c6da556..a6fd6eb0e4b 100644 --- a/components/backends/vllm/src/dynamo/vllm/args.py +++ b/components/backends/vllm/src/dynamo/vllm/args.py @@ -133,17 +133,17 @@ def parse_args() -> Config: ) engine_args.enable_prefix_caching = True - # Recommend deterministic hashing setup when prefix caching is enabled + # Recommend deterministic hashing setup only when prefix caching is enabled + # and no explicit deterministic prefix-caching algorithm is configured. if engine_args.enable_prefix_caching: - phs = os.environ.get("PYTHONHASHSEED") - if not phs or phs == "random": - logger.warning( - "PYTHONHASHSEED is unset or random. For deterministic KV block IDs across processes, set PYTHONHASHSEED=0." - ) - # Best-effort guidance on vLLM hashing mode - logger.info( - "If your vLLM version supports it, consider setting --prefix-caching-algo sha256 to use a deterministic prefix hashing implementation." - ) + algo = getattr(engine_args, "prefix_caching_algo", None) + if not algo: + phs = os.environ.get("PYTHONHASHSEED") + if not phs or phs == "random": + logger.warning( + "Prefix caching enabled with no explicit prefix-caching algorithm. " + "Set PYTHONHASHSEED=0 for stable Python hashing, or configure --prefix-caching-algo (e.g., sha256) if supported." + ) config = Config() config.model = args.model diff --git a/docs/architecture/kv_cache_routing.md b/docs/architecture/kv_cache_routing.md index da4ed9bf034..aef5cf1f44f 100644 --- a/docs/architecture/kv_cache_routing.md +++ b/docs/architecture/kv_cache_routing.md @@ -203,6 +203,10 @@ The two types of events are: The publisher can be initialized and used through C bindings or Python bindings. +### Deterministic Event IDs + +For KV-aware routing to work across multiple workers and restarts, engines must emit deterministic block identifiers in KV events. 
Ensure all workers use identical engine versions/configuration so that block IDs for the same token content remain consistent. If your engine relies on Python's builtin `hash()` for any event IDs, set `PYTHONHASHSEED=0`; otherwise this setting has no effect. The router recomputes local block hashes from tokens for matching, but parent/child links and removals depend on engine-provided IDs being stable. + ### KVIndexer The KVIndexer builds and maintains a global view of cached blocks in a prefix tree. We modify the original prefix tree by also storing the worker id on each node. This is so we can return the number of matched blocks for each worker. diff --git a/docs/guides/kv_events_hashing.md b/docs/guides/kv_events_hashing.md deleted file mode 100644 index 9b10085db93..00000000000 --- a/docs/guides/kv_events_hashing.md +++ /dev/null @@ -1,66 +0,0 @@ - - -# KV Events & Hashing Consistency - -This guide explains how Dynamo computes and consumes KV cache block hashes, and how to ensure consistent hashing across engines, processes, and nodes. - -## Canonical Hashing (Router) - -- Algorithm: xxh3-64 -- Seed: 1337 -- Token encoding: u32 tokens serialized via little-endian `to_le_bytes` -- Scope: Computes "local block hashes" used by the router/indexer to match cached prefixes. - -Reference implementations: -- Rust (primary): `lib/llm/src/kv_router/indexer.rs` (`compute_block_hash_for_seq`) -- Python binding: `dynamo._core.compute_block_hash_for_seq_py` (delegates to the Rust implementation) - -Note: -- `kv_block_size` must be identical between the engine that publishes KV events and the router. A mismatch will yield different local block hashes and break prefix matching. - -Reference test vector check: -- Tokens `[1,2,3,4]`, `kv_block_size=4` → `14643705804678351452` - -## Engine Block IDs vs Router Hashes - -- LocalBlockHash (router): Canonical value used for KV matching. 
-- ExternalSequenceBlockHash (engine): Engine-provided block identifiers to link parent/child and removals; MUST be deterministic within a deployment. - -The router recomputes LocalBlockHash from tokens on ingest. If parent links or removals reference unknown ExternalSequenceBlockHash, the router logs a warning (or error if `DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1`). - -## Engine Configuration Tips - -The goal is to ensure that emitted KV events are deterministic across ranks/restarts. - -General: -- Set `PYTHONHASHSEED=0` for Python processes to eliminate hash randomization. - -vLLM: -- If your version supports it, set a deterministic prefix-caching algorithm, e.g. `--prefix-caching-algo sha256`. -- Keep `enable_prefix_caching=True` when emitting KV events. - -SGLang: -- Ensure events use deterministic block IDs across processes. If applicable, set `PYTHONHASHSEED=0`. - -TensorRT-LLM: -- Use a stable `--random-seed` where applicable and validate that KV event block IDs are deterministic across launches. - -## Observability and Enforcement - -- Warnings on router when parent link is missing or a removal refers to an unknown block id include remediation hints. -- Set `DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1` to promote these warnings to error-level logs. This does not abort processing; the router still skips the offending operation. - -## Quick Self-Check - -From Python: - -```python -from dynamo._core import compute_block_hash_for_seq_py -assert compute_block_hash_for_seq_py([1,2,3,4], 4)[0] == 14643705804678351452 -``` - -If this check fails across nodes, verify environment and engine flags per above. -This self‑check only validates the router’s canonical hashing path (known‑answer test); it does not validate that engine‑emitted block IDs are deterministic. diff --git a/docs/index.rst b/docs/index.rst index e177115214c..3a94e9a09a7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -65,7 +65,6 @@ Quickstart Router Planner KVBM - KV Events & Hashing .. 
toctree:: :hidden: diff --git a/lib/bindings/python/tests/test_kv_bindings.py b/lib/bindings/python/tests/test_kv_bindings.py index 54f48ffeb46..3324ad12798 100644 --- a/lib/bindings/python/tests/test_kv_bindings.py +++ b/lib/bindings/python/tests/test_kv_bindings.py @@ -29,7 +29,6 @@ RadixTree, WorkerMetricsPublisher, WorkerStats, - compute_block_hash_for_seq_py, ) from dynamo.runtime import Component, DistributedRuntime @@ -283,11 +282,3 @@ async def metrics_publisher_task(kv_listener, expected_metrics): # test can discover them. metrics_publisher.publish(metrics) await metrics_publisher.create_endpoint(kv_listener) - - -def test_block_hash_ref_vector(): - # Reference test vector check: tokens [1,2,3,4], kv_block_size=4 - tokens = [1, 2, 3, 4] - out = compute_block_hash_for_seq_py(tokens, 4) - assert isinstance(out, list) and len(out) == 1 - assert out[0] == 14643705804678351452 diff --git a/lib/llm/src/kv_router/indexer.rs b/lib/llm/src/kv_router/indexer.rs index f66b60db4d2..1dc9eaace24 100644 --- a/lib/llm/src/kv_router/indexer.rs +++ b/lib/llm/src/kv_router/indexer.rs @@ -60,6 +60,9 @@ use tokio::sync::{broadcast, mpsc, oneshot}; use tokio_util::sync::CancellationToken; use xxhash_rust::xxh3; +/// Canonical seed used by the router for xxh3-64 local block/sequence hashes. +/// Changing it invalidates persisted local hashes (e.g., snapshots). Internal only; +/// engines must emit their own deterministic external IDs via KV events. pub const XXH3_SEED: u64 = 1337; use crate::kv_router::protocols::*; @@ -84,19 +87,15 @@ pub type WorkerId = i64; /// A shared reference to a [`RadixBlock`]. type SharedRadixBlock = Rc>; +/// xxh3-64 with seed [`XXH3_SEED`]. +#[inline] pub fn compute_hash(data: &[u8]) -> u64 { xxh3::xxh3_64_with_seed(data, XXH3_SEED) } -/// Compute the hash of a local block. -/// -/// ### Arguments -/// -/// * `data` - A byte slice representing the data to hash. -/// -/// ### Returns -/// -/// A `LocalBlockHash` representing the computed hash. 
+/// Hash one local block (xxh3-64, seed [`XXH3_SEED`]); caller must serialize +/// integers as little-endian. Returns a [`LocalBlockHash`] used only for +/// router-local matching; engine external IDs are separate. pub fn compute_block_hash(data: &[u8]) -> LocalBlockHash { LocalBlockHash(compute_hash(data)) } @@ -111,15 +110,10 @@ pub fn compute_block_hash(data: &[u8]) -> LocalBlockHash { // let hash = xxh3::xxh3_64_with_seed(&bytes, XXH3_SEED); // } -/// Compute the hash for a sequence of tokens. -/// -/// ### Arguments -/// -/// * `tokens` - A vector of `u32` tokens. -/// -/// ### Returns -/// -/// A vector of `LocalBlockHash` representing the computed hashes for each chunk of tokens. +/// Compute local block hashes from tokens by: +/// 1) splitting into `kv_block_size` chunks, 2) LE-serializing `u32` tokens, +/// 3) hashing each full chunk with xxh3-64 + [`XXH3_SEED`]. Trailing partial +/// chunk is ignored. Deterministic for identical inputs. pub fn compute_block_hash_for_seq(tokens: &[u32], kv_block_size: u32) -> Vec { tokens .chunks_exact(kv_block_size as usize) // Split into chunks of kv_block_size elements @@ -134,19 +128,9 @@ pub fn compute_block_hash_for_seq(tokens: &[u32], kv_block_size: u32) -> Vec Vec { if block_hashes.is_empty() { return Vec::new(); @@ -335,24 +319,12 @@ impl RadixTree { let mut current = match current { Some(current) => current.clone(), None => { - let enforce = std::env::var("DYN_KV_ENFORCE_ENGINE_HASH_STABILITY") - .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) - .unwrap_or(false); - if enforce { - tracing::error!( - worker_id, - id, - parent_hash = ?op.parent_hash, - "Missing parent block; skipping store. Likely inconsistent hashing across processes. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).", - ); - } else { - tracing::warn!( - worker_id, - id, - parent_hash = ?op.parent_hash, - "Missing parent block; skipping store. 
Set DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1 to log as error. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).", - ); - } + tracing::warn!( + worker_id = worker_id.to_string(), + id, + parent_hash = ?op.parent_hash, + "Failed to find parent block; skipping store operation" + ); return; } }; @@ -401,22 +373,11 @@ impl RadixTree { let entry = match worker_lookup.get(&block) { Some(entry) => entry.clone(), None => { - let enforce = std::env::var("DYN_KV_ENFORCE_ENGINE_HASH_STABILITY") - .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) - .unwrap_or(false); - if enforce { - tracing::error!( - worker_id, - id, - "Missing block to remove; skipping removal. Likely inconsistent hashing across processes. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).", - ); - } else { - tracing::warn!( - worker_id, - id, - "Missing block to remove; skipping removal. Set DYN_KV_ENFORCE_ENGINE_HASH_STABILITY=1 to log as error. Hint: set PYTHONHASHSEED=0 and use deterministic engine settings (e.g., vLLM --prefix-caching-algo sha256).", - ); - } + tracing::warn!( + worker_id = worker_id.to_string(), + id, + "Failed to find block to remove; skipping remove operation" + ); continue; } }; @@ -1186,15 +1147,6 @@ mod tests { } } - #[test] - fn test_block_hash_ref_vector() { - // Reference test vector check: tokens [1,2,3,4], kv_block_size=4 - // Should equal the known xxh3-64(seed=1337) value below. 
- let tokens: Vec = vec![1, 2, 3, 4]; - let hashes = compute_block_hash_for_seq(&tokens, 4); - assert_eq!(hashes.len(), 1); - assert_eq!(hashes[0].0, 14643705804678351452u64); - } fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec) -> RouterEvent { RouterEvent { From e0d98066ada9bc510f7ea36ae33165a826dc2a61 Mon Sep 17 00:00:00 2001 From: Chi McIsaac Date: Wed, 10 Sep 2025 18:53:30 -0400 Subject: [PATCH 04/11] revert index.rst Signed-off-by: Chi McIsaac --- docs/index.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 3a94e9a09a7..a90795ce053 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -50,12 +50,12 @@ Quickstart :hidden: :caption: Kubernetes Deployment - Quickstart (K8s) - Dynamo Operator - Metrics - Logging - Multinode - Minikube Setup + Quickstart (K8s) <../guides/dynamo_deploy/README.md> + Dynamo Operator <../guides/dynamo_deploy/dynamo_operator.md> + Metrics <../guides/dynamo_deploy/metrics.md> + Logging <../guides/dynamo_deploy/logging.md> + Multinode <../guides/dynamo_deploy/multinode-deployment.md> + Minikube Setup <../guides/dynamo_deploy/minikube.md> .. 
toctree:: :hidden: From ff5110288d457a512fdf45fdc86a43609509efb7 Mon Sep 17 00:00:00 2001 From: Chi McIsaac Date: Wed, 10 Sep 2025 19:12:49 -0400 Subject: [PATCH 05/11] fix cargo fmt issue Signed-off-by: Chi McIsaac --- lib/llm/src/kv_router/indexer.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/llm/src/kv_router/indexer.rs b/lib/llm/src/kv_router/indexer.rs index f276b7b0ebb..c491aef7d22 100644 --- a/lib/llm/src/kv_router/indexer.rs +++ b/lib/llm/src/kv_router/indexer.rs @@ -1272,8 +1272,6 @@ mod tests { }, } } - - fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec) -> RouterEvent { RouterEvent { worker_id, From b7a7a2a8e39c61c9408a8f501c135a6db706b7e2 Mon Sep 17 00:00:00 2001 From: Chi McIsaac Date: Wed, 10 Sep 2025 19:26:49 -0400 Subject: [PATCH 06/11] fix clippy error Signed-off-by: Chi McIsaac --- lib/llm/src/kv_router/indexer.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/llm/src/kv_router/indexer.rs b/lib/llm/src/kv_router/indexer.rs index c491aef7d22..fe6a7883688 100644 --- a/lib/llm/src/kv_router/indexer.rs +++ b/lib/llm/src/kv_router/indexer.rs @@ -124,10 +124,12 @@ pub fn compute_block_hash(data: &[u8]) -> LocalBlockHash { // let hash = xxh3::xxh3_64_with_seed(&bytes, XXH3_SEED); // } -/// Compute local block hashes from tokens by: -/// 1) splitting into `kv_block_size` chunks, 2) LE-serializing `u32` tokens, -/// 3) hashing each full chunk with xxh3-64 + [`XXH3_SEED`]. Trailing partial -/// chunk is ignored. Deterministic for identical inputs. +/// Compute local block hashes from tokens. +/// - Split into `kv_block_size` chunks. +/// - LE-serialize `u32` tokens per chunk. +/// - Hash each full chunk with xxh3-64 + [`XXH3_SEED`]; trailing partial chunk is ignored. +/// +/// Deterministic for identical inputs. 
pub fn compute_block_hash_for_seq(tokens: &[u32], kv_block_size: u32) -> Vec<LocalBlockHash> { tokens .chunks_exact(kv_block_size as usize) // Split into chunks of kv_block_size elements From 557f06ff6b728fa8cff9320a0a2234783834be64 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Thu, 18 Sep 2025 12:34:47 -0700 Subject: [PATCH 07/11] revert indexer.rs Signed-off-by: PeaBrane --- lib/llm/src/kv_router/indexer.rs | 49 +++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/lib/llm/src/kv_router/indexer.rs b/lib/llm/src/kv_router/indexer.rs index 451d64e2280..194d0d693f0 100644 --- a/lib/llm/src/kv_router/indexer.rs +++ b/lib/llm/src/kv_router/indexer.rs @@ -52,9 +52,6 @@ use tokio::sync::{broadcast, mpsc, oneshot}; use tokio_util::sync::CancellationToken; use xxhash_rust::xxh3; -/// Canonical seed used by the router for xxh3-64 local block/sequence hashes. -/// Changing it invalidates persisted local hashes (e.g., snapshots). Internal only; -/// engines must emit their own deterministic external IDs via KV events. pub const XXH3_SEED: u64 = 1337; use crate::kv_router::protocols::*; @@ -89,15 +86,19 @@ pub type WorkerId = i64; /// A shared reference to a [`RadixBlock`]. type SharedRadixBlock = Rc>; -/// xxh3-64 with seed [`XXH3_SEED`]. -#[inline] pub fn compute_hash(data: &[u8]) -> u64 { xxh3::xxh3_64_with_seed(data, XXH3_SEED) } -/// Hash one local block (xxh3-64, seed [`XXH3_SEED`]); caller must serialize -/// integers as little-endian. Returns a [`LocalBlockHash`] used only for -/// router-local matching; engine external IDs are separate. +/// Compute the hash of a local block. +/// +/// ### Arguments +/// +/// * `data` - A byte slice representing the data to hash. +/// +/// ### Returns +/// +/// A `LocalBlockHash` representing the computed hash.
pub fn compute_block_hash(data: &[u8]) -> LocalBlockHash { LocalBlockHash(compute_hash(data)) } @@ -112,12 +113,15 @@ pub fn compute_block_hash(data: &[u8]) -> LocalBlockHash { // let hash = xxh3::xxh3_64_with_seed(&bytes, XXH3_SEED); // } -/// Compute local block hashes from tokens. -/// - Split into `kv_block_size` chunks. -/// - LE-serialize `u32` tokens per chunk. -/// - Hash each full chunk with xxh3-64 + [`XXH3_SEED`]; trailing partial chunk is ignored. +/// Compute the hash for a sequence of tokens. +/// +/// ### Arguments +/// +/// * `tokens` - A vector of `u32` tokens. +/// +/// ### Returns /// -/// Deterministic for identical inputs. +/// A vector of `LocalBlockHash` representing the computed hashes for each chunk of tokens. pub fn compute_block_hash_for_seq(tokens: &[u32], kv_block_size: u32) -> Vec { tokens .chunks_exact(kv_block_size as usize) // Split into chunks of kv_block_size elements @@ -132,9 +136,19 @@ pub fn compute_block_hash_for_seq(tokens: &[u32], kv_block_size: u32) -> Vec Vec { if block_hashes.is_empty() { return Vec::new(); @@ -1271,6 +1285,7 @@ mod tests { }, } } + fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec) -> RouterEvent { RouterEvent { worker_id, @@ -2143,4 +2158,4 @@ mod tests { 1 ); } -} +} \ No newline at end of file From da3c230a45eed43e10e16f68b602bdd9b74cff33 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Thu, 18 Sep 2025 12:36:08 -0700 Subject: [PATCH 08/11] extra empty line at end Signed-off-by: PeaBrane --- lib/llm/src/kv_router/indexer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/llm/src/kv_router/indexer.rs b/lib/llm/src/kv_router/indexer.rs index 194d0d693f0..10ff3d3b3eb 100644 --- a/lib/llm/src/kv_router/indexer.rs +++ b/lib/llm/src/kv_router/indexer.rs @@ -2158,4 +2158,4 @@ mod tests { 1 ); } -} \ No newline at end of file +} From 34b97603b45ba37245ff512e99a681b849b7cdab Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Thu, 18 Sep 2025 12:38:07 -0700 Subject: [PATCH 
09/11] revert args.py Signed-off-by: PeaBrane --- components/backends/vllm/src/dynamo/vllm/args.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/components/backends/vllm/src/dynamo/vllm/args.py b/components/backends/vllm/src/dynamo/vllm/args.py index a6fd6eb0e4b..f8b12d5340f 100644 --- a/components/backends/vllm/src/dynamo/vllm/args.py +++ b/components/backends/vllm/src/dynamo/vllm/args.py @@ -133,18 +133,6 @@ def parse_args() -> Config: ) engine_args.enable_prefix_caching = True - # Recommend deterministic hashing setup only when prefix caching is enabled - # and no explicit deterministic prefix-caching algorithm is configured. - if engine_args.enable_prefix_caching: - algo = getattr(engine_args, "prefix_caching_algo", None) - if not algo: - phs = os.environ.get("PYTHONHASHSEED") - if not phs or phs == "random": - logger.warning( - "Prefix caching enabled with no explicit prefix-caching algorithm. " - "Set PYTHONHASHSEED=0 for stable Python hashing, or configure --prefix-caching-algo (e.g., sha256) if supported." - ) - config = Config() config.model = args.model if args.served_model_name: From 1eac768f28539086f56dc046344c9c6cc7fc072b Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Thu, 18 Sep 2025 12:40:17 -0700 Subject: [PATCH 10/11] only keep hashing notes for vllm Signed-off-by: PeaBrane --- components/backends/sglang/README.md | 7 ------- components/backends/trtllm/README.md | 9 --------- 2 files changed, 16 deletions(-) diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md index 105f33c188e..7fa1693c3de 100644 --- a/components/backends/sglang/README.md +++ b/components/backends/sglang/README.md @@ -215,13 +215,6 @@ python3 -m dynamo.sglang ... --migration-limit=3 This allows a request to be migrated up to 3 times before failing. See the [Request Migration Architecture](../../../docs/architecture/request_migration.md) documentation for details on how this works. 
-## Hashing Consistency for KV Events - -When emitting KV events for KV-aware routing, ensure event identifiers are deterministic across processes and restarts so the router can correctly apply parent links and removals: - -- Ensure the block IDs published in events are deterministic across ranks and restarts. -- See the high-level notes in [KV Cache Routing](../../../docs/architecture/kv_cache_routing.md) on deterministic event IDs. - ## Advanced Examples Below we provide a selected list of advanced examples. Please open up an issue if you'd like to see a specific example! diff --git a/components/backends/trtllm/README.md b/components/backends/trtllm/README.md index e1eab55a9ea..3d0b685570e 100644 --- a/components/backends/trtllm/README.md +++ b/components/backends/trtllm/README.md @@ -199,15 +199,6 @@ NOTE: To send a request to a multi-node deployment, target the node which is run To benchmark your deployment with GenAI-Perf, see this utility script, configuring the `model` name and `host` based on your deployment: [perf.sh](../../../benchmarks/llm/perf.sh) -## Hashing Consistency for KV Events - -When using KV-aware routing with TensorRT-LLM, ensure event identifiers are deterministic across processes and runs so the router can correctly apply parent links and removals: - -- Ensure all workers run the same TRT-LLM version/build and engine configuration so block ID computation is consistent. -- Validate that the KV event block IDs are identical across ranks/restarts for the same inputs. -- Client-side sampling seeds (e.g., in benchmarking tools) do not affect KV block IDs. -- See the high-level notes in [KV Cache Routing](../../../docs/architecture/kv_cache_routing.md) on deterministic event IDs. 
- ## Disaggregation Strategy From 52e76be4874a2abb6ed57ee3379be705fc6e560a Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Thu, 18 Sep 2025 12:45:21 -0700 Subject: [PATCH 11/11] use fixed seed for router benchmarking Signed-off-by: PeaBrane --- benchmarks/router/run_engines.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/router/run_engines.sh b/benchmarks/router/run_engines.sh index 8d7b5f8f930..42dea02fd88 100755 --- a/benchmarks/router/run_engines.sh +++ b/benchmarks/router/run_engines.sh @@ -125,8 +125,8 @@ for i in $(seq 1 $NUM_WORKERS); do "${EXTRA_ARGS[@]}" else echo "[Worker-$i] Using GPUs: $GPU_DEVICES" - # Run vLLM engine (exec with env for proper syntax) - exec env CUDA_VISIBLE_DEVICES=$GPU_DEVICES python -m dynamo.vllm \ + # Run vLLM engine with PYTHONHASHSEED=0 for deterministic event IDs in KV-aware routing + exec env PYTHONHASHSEED=0 CUDA_VISIBLE_DEVICES=$GPU_DEVICES python -m dynamo.vllm \ --model "$MODEL_PATH" \ --endpoint dyn://test.vllm.generate \ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \