Merged
Changes from 1 commit
Commits
28 commits
558482d
feat: initial benchmarking wrapper in-cluster work
hhzhang16 Sep 8, 2025
7cc6edb
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dyn-9…
hhzhang16 Sep 18, 2025
5a09233
feat: update benchmark job for in-cluster benchmarking following late…
hhzhang16 Sep 18, 2025
8f19a4d
feat: update in-cluster benchmark job and yaml
hhzhang16 Sep 19, 2025
3ff6675
feat: enhance GPT OSS frontend with improved harmony tool calling par…
zhongdaor-nv Sep 18, 2025
9482320
feat(operator): mechanism for disabling imagePullSecrets discovery (#…
tmonty12 Sep 18, 2025
f7cc9e9
refactor: simplify Dockerfile.vllm, enable local-dev for all framewor…
keivenchang Sep 19, 2025
d5f0495
feat: Request Cancellation unary request support (#3004)
kthui Sep 19, 2025
1648836
build: update trtllm to v1.1.0rc5 to enable trtllm + KVBM integration…
richardhuo-nv Sep 19, 2025
91181f6
build: OPS-597, OPS-861 restructure TRT-LLM to follow container strat…
nv-tusharma Sep 19, 2025
89e074c
feat: Sglang canary health check (#3103)
tzulingk Sep 19, 2025
271ef47
feat: Convert message[content] from list to string. (#3067)
KrishnanPrash Sep 19, 2025
f79e57b
feat: KVBM connector : enabling vectorized copy from pinned memory to…
oandreeva-nv Sep 19, 2025
8ee077f
feat: update READMe commands
hhzhang16 Sep 19, 2025
4ac8147
feat: update READMe commands
hhzhang16 Sep 19, 2025
e7ed272
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dyn-9…
hhzhang16 Sep 19, 2025
534ba19
docs: move in-cluster benchmarking doc to the overall benchmarking do…
hhzhang16 Sep 19, 2025
0235ece
feat: minor adjustments based on self look-through and coderabbit com…
hhzhang16 Sep 19, 2025
b392205
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dyn-9…
hhzhang16 Sep 22, 2025
ef92388
docs: add benchmarking cross-namespace
hhzhang16 Sep 22, 2025
69bcfa8
docs: have user modify benchmark job instead of using envsubst
hhzhang16 Sep 22, 2025
e83590b
docs: add tldr
hhzhang16 Sep 22, 2025
efd16d6
docs: minor doc updates
hhzhang16 Sep 22, 2025
ae9e70e
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dyn-9…
hhzhang16 Sep 22, 2025
5131348
docs: update k8s-related stuff in benchmarking.md
hhzhang16 Sep 23, 2025
38955ef
Merge branch 'main' into hannahz/dyn-973-allow-in-cluster-perf-benchm…
hhzhang16 Sep 23, 2025
a5e5b18
docs: updating client-side prereqs
hhzhang16 Sep 23, 2025
de853cf
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dyn-9…
hhzhang16 Sep 23, 2025
feat: KVBM connector : enabling vectorized copy from pinned memory to device memory and vice versa (#2989)

Signed-off-by: Olga Andreeva <[email protected]>
Signed-off-by: oandreeva-nv <[email protected]>
Co-authored-by: Ziqi Fan <[email protected]>
Co-authored-by: oandreeva-nv <[email protected]>
3 people authored and hhzhang16 committed Sep 19, 2025
commit f79e57b1ad949b143d0c1489ba76d179e2f25e19
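
As context for the diffs below, here is a conceptual sketch of what a vectorized copy means here (hypothetical Rust types, not code from this PR): instead of issuing one async memcpy per non-contiguous block, the host collects one descriptor per (source, target) pair and a single custom kernel launch walks the whole batch.

// Conceptual sketch only; the names below are illustrative, not from this PR.
struct CopyDescriptor {
    src: *const u8, // source pointer (pinned host or device memory)
    dst: *mut u8,   // destination pointer on the opposite side
    len: usize,     // bytes to copy for this block
}

// One launch consumes the whole batch instead of one memcpy call per block.
fn launch_vectorized_copy(_batch: &[CopyDescriptor]) {
    // Stub: the real work happens in the CUDA fatbin introduced by this change.
}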
1 change: 1 addition & 0 deletions .gitattributes
@@ -7,6 +7,7 @@
*.[Pp][Nn][Gg] binary
*.[Zz][Ii][Pp] binary
*.[Tt][Gg][Zz] binary
*.fatbin binary

# Exclude test data files from linguist language detection
lib/llm/tests/data/** linguist-vendored
4 changes: 2 additions & 2 deletions Cargo.lock


@@ -218,7 +218,7 @@ impl Leader for KvConnectorLeader
);

if slot.state() == SlotState::SkippedPrefill || slot.state() == SlotState::SkippedDecode {
tracing::warn!("slot is in the SkippedPrefill or SkippedDecode state; will resume from skipped and return early");
tracing::debug!("slot is in the SkippedPrefill or SkippedDecode state; will resume from skipped and return early");
match slot.state() {
SlotState::SkippedPrefill => {
slot.mark_as_prefilling(self.iteration_counter)?;
@@ -398,7 +398,7 @@ impl VllmConnectorSlot
SlotState::SkippedPrefill => Ok(()), // already skipped
SlotState::SkippedDecode => Ok(()), // already skipped
_ => {
tracing::warn!("slot is in the {:?} state; will not explicitly mark as skipped, request_id: {}", self.state, self.request_id);
tracing::debug!("slot is in the {:?} state; will not explicitly mark as skipped, request_id: {}", self.state, self.request_id);
Ok(())
}
}
@@ -64,9 +64,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
Args: kv_caches:
dictionary of layer names, kv cache
"""
print(
f"KvConnectorWorker.register_kv_caches called with {len(kv_caches)} kv_caches"
)

cache_config = self.vllm_config.cache_config

# Create ordered list of (layer_name, tensor) tuples sorted by layer index
76 changes: 75 additions & 1 deletion lib/llm/build.rs
@@ -1,16 +1,90 @@
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::env;
use std::path::PathBuf;

fn main() -> Result<(), Box<dyn std::error::Error>> {
// Declare our custom cfg flag to avoid unexpected_cfgs warnings
println!("cargo:rustc-check-cfg=cfg(have_vec_copy_fatbin)");

println!("cargo:warning=Building with CUDA KV off");
build_protos()
build_protos()?;

// Get FATBIN path and copy it to OUT_DIR for embedding
if let Some(fatbin_path) = find_fatbin_file() {
// Copy FATBIN to OUT_DIR so we can include it with a predictable path
let out_dir = env::var("OUT_DIR").unwrap();
let dest_path = PathBuf::from(out_dir).join("vectorized_copy.fatbin");

if let Err(e) = std::fs::copy(&fatbin_path, &dest_path) {
println!("cargo:warning=Failed to copy FATBIN to OUT_DIR: {}", e);
} else {
// Emit cfg flag for conditional compilation
println!("cargo:rustc-cfg=have_vec_copy_fatbin");
println!(
"cargo:warning=CUDA FATBIN found at: {} - copied to OUT_DIR",
fatbin_path.display()
);
}

// Tell cargo to rerun if FATBIN file changes
println!("cargo:rerun-if-changed={}", fatbin_path.display());
} else {
println!(
"cargo:warning=CUDA FATBIN not found - run 'make fatbin' in cuda_kernels directory"
);
println!("cargo:warning=Set DYNAMO_FATBIN_PATH env var to specify custom location");
}

// Rerun build if environment variable changes
println!("cargo:rerun-if-env-changed=DYNAMO_FATBIN_PATH");

Ok(())
}

fn build_protos() -> Result<(), Box<dyn std::error::Error>> {
tonic_build::compile_protos("src/grpc/protos/kserve.proto")?;
Ok(())
}

fn find_fatbin_file() -> Option<PathBuf> {
// 1. Check if user specified custom path via environment variable
if let Ok(custom_path) = env::var("DYNAMO_FATBIN_PATH") {
let fatbin_file = PathBuf::from(custom_path);
if fatbin_file.exists() {
println!(
"cargo:warning=Using custom FATBIN path: {}",
fatbin_file.display()
);
return Some(fatbin_file);
} else {
println!(
"cargo:warning=Custom FATBIN path does not exist: {}",
fatbin_file.display()
);
}
}

// 2. Check standard locations (priority order)
let default_paths = [
"./src/block_manager/block/transfer/kernels/vectorized_copy.fatbin", // Primary: Next to transfer module
];

for path in &default_paths {
let fatbin_file = PathBuf::from(path);
if fatbin_file.exists() {
println!(
"cargo:warning=Found FATBIN at default location: {}",
fatbin_file.display()
);
return Some(fatbin_file);
}
}

None
}

// NOTE: Preserving this build.rs for reference. We may want to re-enable
// custom kernel compilation in the future.

88 changes: 80 additions & 8 deletions lib/llm/src/block_manager/block/transfer.rs
@@ -14,16 +14,14 @@ use crate::block_manager::storage::{
nixl::{NixlRegisterableStorage, NixlStorage},
};

use cudarc::driver::CudaStream;

use nixl_sys::NixlDescriptor;
use nixl_sys::XferOp::{Read, Write};
use std::ops::Range;
use tokio::sync::oneshot;

pub use crate::block_manager::storage::{CudaAccessible, Local, Remote};
pub use async_trait::async_trait;
pub use context::TransferContext;
pub use context::{PoolConfig, TransferContext};

/// A block that can be the target of a write
pub trait Writable {}
@@ -82,6 +80,14 @@ impl NixlTransfer {
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaTransferMode {
/// Use the custom CUDA kernel for G1 <-> G2 transfers
Custom,
/// Use the default CUDA async memcpy for G1 <-> G2 transfers
Default,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TransferStrategy {
Memcpy,
@@ -135,6 +141,33 @@
}
}

#[inline]
fn resolve_cuda_transfer_mode(
base_strategy: TransferStrategy,
is_contiguous: bool,
) -> CudaTransferMode {
match base_strategy {
TransferStrategy::CudaAsyncH2D => {
if is_contiguous {
CudaTransferMode::Default
} else {
CudaTransferMode::Custom
}
}
TransferStrategy::CudaAsyncD2H => {
if is_contiguous {
CudaTransferMode::Default
} else {
CudaTransferMode::Custom
}
}
other => panic!(
"resolve_cuda_strategy called with non-CUDA strategy: {:?}",
other
),
}
}

pub fn handle_local_transfer<RB, WB>(
sources: &[RB],
targets: &mut [WB],
@@ -162,12 +195,51 @@
TransferStrategy::CudaAsyncH2D
| TransferStrategy::CudaAsyncD2H
| TransferStrategy::CudaAsyncD2D => {
for (src, dst) in sources.iter().zip(targets.iter_mut()) {
cuda::copy_block(src, dst, ctx.stream().as_ref(), RB::write_to_strategy())?;
tracing::debug!(
"Transfer: Using CUDA strategy: {:?}",
RB::write_to_strategy()
);

if RB::write_to_strategy() == TransferStrategy::CudaAsyncH2D
|| RB::write_to_strategy() == TransferStrategy::CudaAsyncD2H
{
let is_contiguous = sources[0].block_data().is_fully_contiguous()
&& targets[0].block_data().is_fully_contiguous();
let transfer_mode =
resolve_cuda_transfer_mode(RB::write_to_strategy(), is_contiguous);

match transfer_mode {
CudaTransferMode::Custom => {
let selected_stream = ctx.stream();
cuda::copy_blocks_with_customized_kernel(
sources,
targets,
selected_stream.as_ref(),
&ctx,
)?;
}
CudaTransferMode::Default => {
for (src, dst) in sources.iter().zip(targets.iter_mut()) {
cuda::copy_block(
src,
dst,
ctx.stream().as_ref(),
RB::write_to_strategy(),
)?;
}
}
}
ctx.cuda_event(tx)?;

Ok(rx)
} else {
// Fall back to individual copy for D2D blocks
for (src, dst) in sources.iter().zip(targets.iter_mut()) {
cuda::copy_block(src, dst, ctx.stream().as_ref(), RB::write_to_strategy())?;
}
ctx.cuda_event(tx)?;
Ok(rx)
}

ctx.cuda_event(tx)?;
Ok(rx)
}
TransferStrategy::Nixl(transfer_type) => {
let transfer_fut = nixl::write_blocks_to(sources, targets, &ctx, transfer_type)?;
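
An illustrative check of the dispatch rule added above (hypothetical test code, not part of this change): for H2D and D2H transfers, fully contiguous layouts keep the default async memcpy while non-contiguous layouts route through the custom kernel.

#[test]
fn cuda_transfer_mode_follows_contiguity() {
    // Contiguous host-to-device copies stay on the plain async-memcpy path.
    assert_eq!(
        resolve_cuda_transfer_mode(TransferStrategy::CudaAsyncH2D, true),
        CudaTransferMode::Default
    );
    // Non-contiguous device-to-host copies use the custom vectorized kernel.
    assert_eq!(
        resolve_cuda_transfer_mode(TransferStrategy::CudaAsyncD2H, false),
        CudaTransferMode::Custom
    );
}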