Merged
Changes from 1 commit
Commits
28 commits
558482d
feat: initial benchmarking wrapper in-cluster work
hhzhang16 Sep 8, 2025
7cc6edb
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dyn-9…
hhzhang16 Sep 18, 2025
5a09233
feat: update benchmark job for in-cluster benchmarking following late…
hhzhang16 Sep 18, 2025
8f19a4d
feat: update in-cluster benchmark job and yaml
hhzhang16 Sep 19, 2025
3ff6675
feat: enhance GPT OSS frontend with improved harmony tool calling par…
zhongdaor-nv Sep 18, 2025
9482320
feat(operator): mechanism for disabling imagePullSecrets discovery (#…
tmonty12 Sep 18, 2025
f7cc9e9
refactor: simplify Dockerfile.vllm, enable local-dev for all framewor…
keivenchang Sep 19, 2025
d5f0495
feat: Request Cancellation unary request support (#3004)
kthui Sep 19, 2025
1648836
build: update trtllm to v1.1.0rc5 to enable trtllm + KVBM integration…
richardhuo-nv Sep 19, 2025
91181f6
build: OPS-597, OPS-861 restructure TRT-LLM to follow container strat…
nv-tusharma Sep 19, 2025
89e074c
feat: Sglang canary health check (#3103)
tzulingk Sep 19, 2025
271ef47
feat: Convert message[content] from list to string. (#3067)
KrishnanPrash Sep 19, 2025
f79e57b
feat: KVBM connector : enabling vectorized copy from pinned memory to…
oandreeva-nv Sep 19, 2025
8ee077f
feat: update READMe commands
hhzhang16 Sep 19, 2025
4ac8147
feat: update READMe commands
hhzhang16 Sep 19, 2025
e7ed272
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dyn-9…
hhzhang16 Sep 19, 2025
534ba19
docs: move in-cluster benchmarking doc to the overall benchmarking do…
hhzhang16 Sep 19, 2025
0235ece
feat: minor adjustments based on self look-through and coderabbit com…
hhzhang16 Sep 19, 2025
b392205
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dyn-9…
hhzhang16 Sep 22, 2025
ef92388
docs: add benchmarking cross-namespace
hhzhang16 Sep 22, 2025
69bcfa8
docs: have user modify benchmark job instead of using envsubst
hhzhang16 Sep 22, 2025
e83590b
docs: add tldr
hhzhang16 Sep 22, 2025
efd16d6
docs: minor doc updates
hhzhang16 Sep 22, 2025
ae9e70e
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dyn-9…
hhzhang16 Sep 22, 2025
5131348
docs: update k8s-related stuff in benchmarking.md
hhzhang16 Sep 23, 2025
38955ef
Merge branch 'main' into hannahz/dyn-973-allow-in-cluster-perf-benchm…
hhzhang16 Sep 23, 2025
a5e5b18
docs: updating client-side prereqs
hhzhang16 Sep 23, 2025
de853cf
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dyn-9…
hhzhang16 Sep 23, 2025
feat: KVBM connector : enabling vectorized copy from pinned memory to device memory and vice versa (#2989)

Signed-off-by: Olga Andreeva <[email protected]>
Signed-off-by: oandreeva-nv <[email protected]>
Co-authored-by: Ziqi Fan <[email protected]>
Co-authored-by: oandreeva-nv <[email protected]>
3 people authored and hhzhang16 committed Sep 19, 2025
commit f79e57b1ad949b143d0c1489ba76d179e2f25e19
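
As context for the diffs below, here is a conceptual sketch of what a vectorized copy means here (hypothetical Rust types, not code from this PR): instead of issuing one async memcpy per non-contiguous block, the host collects one descriptor per (source, target) pair and a single custom kernel launch walks the whole batch.

// Conceptual sketch only; the names below are illustrative, not from this PR.
struct CopyDescriptor {
    src: *const u8, // source pointer (pinned host or device memory)
    dst: *mut u8,   // destination pointer on the opposite side
    len: usize,     // bytes to copy for this block
}

// One launch consumes the whole batch instead of one memcpy call per block.
fn launch_vectorized_copy(_batch: &[CopyDescriptor]) {
    // Stub: the real work happens in the CUDA fatbin introduced by this change.
}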
1 change: 1 addition & 0 deletions .gitattributes
@@ -7,6 +7,7 @@
*.[Pp][Nn][Gg] binary
*.[Zz][Ii][Pp] binary
*.[Tt][Gg][Zz] binary
*.fatbin binary

# Exclude test data files from linguist language detection
lib/llm/tests/data/** linguist-vendored
4 changes: 2 additions & 2 deletions Cargo.lock


@@ -218,7 +218,7 @@ impl Leader for KvConnectorLeader
);

if slot.state() == SlotState::SkippedPrefill || slot.state() == SlotState::SkippedDecode {
tracing::warn!("slot is in the SkippedPrefill or SkippedDecode state; will resume from skipped and return early");
tracing::debug!("slot is in the SkippedPrefill or SkippedDecode state; will resume from skipped and return early");
match slot.state() {
SlotState::SkippedPrefill => {
slot.mark_as_prefilling(self.iteration_counter)?;
@@ -398,7 +398,7 @@ impl VllmConnectorSlot
SlotState::SkippedPrefill => Ok(()), // already skipped
SlotState::SkippedDecode => Ok(()), // already skipped
_ => {
tracing::warn!("slot is in the {:?} state; will not explicitly mark as skipped, request_id: {}", self.state, self.request_id);
tracing::debug!("slot is in the {:?} state; will not explicitly mark as skipped, request_id: {}", self.state, self.request_id);
Ok(())
}
}
@@ -64,9 +64,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
Args: kv_caches:
dictionary of layer names, kv cache
"""
print(
f"KvConnectorWorker.register_kv_caches called with {len(kv_caches)} kv_caches"
)

cache_config = self.vllm_config.cache_config

# Create ordered list of (layer_name, tensor) tuples sorted by layer index
76 changes: 75 additions & 1 deletion lib/llm/build.rs
@@ -1,16 +1,90 @@
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::env;
use std::path::PathBuf;

fn main() -> Result<(), Box<dyn std::error::Error>> {
// Declare our custom cfg flag to avoid unexpected_cfgs warnings
println!("cargo:rustc-check-cfg=cfg(have_vec_copy_fatbin)");

println!("cargo:warning=Building with CUDA KV off");
build_protos()
build_protos()?;

// Get FATBIN path and copy it to OUT_DIR for embedding
if let Some(fatbin_path) = find_fatbin_file() {
// Copy FATBIN to OUT_DIR so we can include it with a predictable path
let out_dir = env::var("OUT_DIR").unwrap();
let dest_path = PathBuf::from(out_dir).join("vectorized_copy.fatbin");

if let Err(e) = std::fs::copy(&fatbin_path, &dest_path) {
println!("cargo:warning=Failed to copy FATBIN to OUT_DIR: {}", e);
} else {
// Emit cfg flag for conditional compilation
println!("cargo:rustc-cfg=have_vec_copy_fatbin");
println!(
"cargo:warning=CUDA FATBIN found at: {} - copied to OUT_DIR",
fatbin_path.display()
);
}

// Tell cargo to rerun if FATBIN file changes
println!("cargo:rerun-if-changed={}", fatbin_path.display());
} else {
println!(
"cargo:warning=CUDA FATBIN not found - run 'make fatbin' in cuda_kernels directory"
);
println!("cargo:warning=Set DYNAMO_FATBIN_PATH env var to specify custom location");
}

// Rerun build if environment variable changes
println!("cargo:rerun-if-env-changed=DYNAMO_FATBIN_PATH");

Ok(())
}

fn build_protos() -> Result<(), Box<dyn std::error::Error>> {
tonic_build::compile_protos("src/grpc/protos/kserve.proto")?;
Ok(())
}

fn find_fatbin_file() -> Option<PathBuf> {
// 1. Check if user specified custom path via environment variable
if let Ok(custom_path) = env::var("DYNAMO_FATBIN_PATH") {
let fatbin_file = PathBuf::from(custom_path);
if fatbin_file.exists() {
println!(
"cargo:warning=Using custom FATBIN path: {}",
fatbin_file.display()
);
return Some(fatbin_file);
} else {
println!(
"cargo:warning=Custom FATBIN path does not exist: {}",
fatbin_file.display()
);
}
}

// 2. Check standard locations (priority order)
let default_paths = [
"./src/block_manager/block/transfer/kernels/vectorized_copy.fatbin", // Primary: Next to transfer module
];

for path in &default_paths {
let fatbin_file = PathBuf::from(path);
if fatbin_file.exists() {
println!(
"cargo:warning=Found FATBIN at default location: {}",
fatbin_file.display()
);
return Some(fatbin_file);
}
}

None
}

// NOTE: Preserving this build.rs for reference. We may want to re-enable
// custom kernel compilation in the future.

88 changes: 80 additions & 8 deletions lib/llm/src/block_manager/block/transfer.rs
@@ -14,16 +14,14 @@ use crate::block_manager::storage::{
nixl::{NixlRegisterableStorage, NixlStorage},
};

use cudarc::driver::CudaStream;

use nixl_sys::NixlDescriptor;
use nixl_sys::XferOp::{Read, Write};
use std::ops::Range;
use tokio::sync::oneshot;

pub use crate::block_manager::storage::{CudaAccessible, Local, Remote};
pub use async_trait::async_trait;
pub use context::TransferContext;
pub use context::{PoolConfig, TransferContext};

/// A block that can be the target of a write
pub trait Writable {}
@@ -82,6 +80,14 @@ impl NixlTransfer {
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaTransferMode {
/// Use the custom CUDA kernel for G1 <-> G2 transfers
Custom,
/// Use the default CUDA async memcpy for G1 <-> G2 transfers
Default,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TransferStrategy {
Memcpy,
@@ -135,6 +141,33 @@
}
}

#[inline]
fn resolve_cuda_transfer_mode(
base_strategy: TransferStrategy,
is_contiguous: bool,
) -> CudaTransferMode {
match base_strategy {
TransferStrategy::CudaAsyncH2D => {
if is_contiguous {
CudaTransferMode::Default
} else {
CudaTransferMode::Custom
}
}
TransferStrategy::CudaAsyncD2H => {
if is_contiguous {
CudaTransferMode::Default
} else {
CudaTransferMode::Custom
}
}
other => panic!(
"resolve_cuda_strategy called with non-CUDA strategy: {:?}",
other
),
}
}

pub fn handle_local_transfer<RB, WB>(
sources: &[RB],
targets: &mut [WB],
@@ -162,12 +195,51 @@
TransferStrategy::CudaAsyncH2D
| TransferStrategy::CudaAsyncD2H
| TransferStrategy::CudaAsyncD2D => {
for (src, dst) in sources.iter().zip(targets.iter_mut()) {
cuda::copy_block(src, dst, ctx.stream().as_ref(), RB::write_to_strategy())?;
tracing::debug!(
"Transfer: Using CUDA strategy: {:?}",
RB::write_to_strategy()
);

if RB::write_to_strategy() == TransferStrategy::CudaAsyncH2D
|| RB::write_to_strategy() == TransferStrategy::CudaAsyncD2H
{
let is_contiguous = sources[0].block_data().is_fully_contiguous()
&& targets[0].block_data().is_fully_contiguous();
let transfer_mode =
resolve_cuda_transfer_mode(RB::write_to_strategy(), is_contiguous);

match transfer_mode {
CudaTransferMode::Custom => {
let selected_stream = ctx.stream();
cuda::copy_blocks_with_customized_kernel(
sources,
targets,
selected_stream.as_ref(),
&ctx,
)?;
}
CudaTransferMode::Default => {
for (src, dst) in sources.iter().zip(targets.iter_mut()) {
cuda::copy_block(
src,
dst,
ctx.stream().as_ref(),
RB::write_to_strategy(),
)?;
}
}
}
ctx.cuda_event(tx)?;

Ok(rx)
} else {
// Fall back to individual copy for D2D blocks
for (src, dst) in sources.iter().zip(targets.iter_mut()) {
cuda::copy_block(src, dst, ctx.stream().as_ref(), RB::write_to_strategy())?;
}
ctx.cuda_event(tx)?;
Ok(rx)
}

ctx.cuda_event(tx)?;
Ok(rx)
}
TransferStrategy::Nixl(transfer_type) => {
let transfer_fut = nixl::write_blocks_to(sources, targets, &ctx, transfer_type)?;
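
An illustrative check of the dispatch rule added above (hypothetical test code, not part of this change): for H2D and D2H transfers, fully contiguous layouts keep the default async memcpy while non-contiguous layouts route through the custom kernel.

#[test]
fn cuda_transfer_mode_follows_contiguity() {
    // Contiguous host-to-device copies stay on the plain async-memcpy path.
    assert_eq!(
        resolve_cuda_transfer_mode(TransferStrategy::CudaAsyncH2D, true),
        CudaTransferMode::Default
    );
    // Non-contiguous device-to-host copies use the custom vectorized kernel.
    assert_eq!(
        resolve_cuda_transfer_mode(TransferStrategy::CudaAsyncD2H, false),
        CudaTransferMode::Custom
    );
}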