From 74bc13f56487a2c3ee67052e7744e8b6f916dc5e Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 27 May 2025 14:00:14 -0700 Subject: [PATCH 01/36] use BTreeSet, and allow for push_front (preemption) --- lib/llm/src/mocker/evictor.rs | 204 +++++++++++++++++----------------- 1 file changed, 103 insertions(+), 101 deletions(-) diff --git a/lib/llm/src/mocker/evictor.rs b/lib/llm/src/mocker/evictor.rs index 47a312eede..bd1f827ebe 100644 --- a/lib/llm/src/mocker/evictor.rs +++ b/lib/llm/src/mocker/evictor.rs @@ -13,60 +13,103 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::cmp::Eq; -use std::collections::{HashMap, VecDeque}; +use std::cmp::{Eq, Ordering}; +use std::collections::{BTreeSet, HashMap}; use std::hash::Hash; -use std::time::Instant; + +/// A wrapper for (T, counter) that implements Ord based only on counter +#[derive(Debug, Clone, Eq, PartialEq)] +struct PriorityItem { + item: T, + counter: i64, +} + +impl Ord for PriorityItem { + fn cmp(&self, other: &Self) -> Ordering { + self.counter.cmp(&other.counter) + } +} + +impl PartialOrd for PriorityItem { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} /// An LRU evictor that maintains objects and evicts them based on their -/// last accessed time. Implements a "lazy" eviction mechanism where: -/// 1. The priority queue does not immediately reflect updates or removes -/// 2. Objects are pushed to the queue in order of increasing priority (older objects first) -/// 3. The user must ensure objects are added in correct priority (temporal order) -/// 4. Remove and update operations are lazy - entries remain in the queue until -/// they are either evicted or cleaned up during maintenance +/// priority counter. Lower counter values are evicted first. #[derive(Debug)] pub struct LRUEvictor { - free_table: HashMap, - priority_queue: VecDeque<(T, f64)>, - cleanup_threshold: usize, - start_time: Instant, + free_table: HashMap, + priority_queue: BTreeSet>, + positive_counter: i64, + negative_counter: i64, } impl Default for LRUEvictor { fn default() -> Self { Self { free_table: HashMap::new(), - priority_queue: VecDeque::new(), - cleanup_threshold: 50, - start_time: Instant::now(), + priority_queue: BTreeSet::new(), + positive_counter: 0, + negative_counter: 0, } } } impl LRUEvictor { - /// Create a new LRUEvictor with the default cleanup threshold - pub fn new(cleanup_threshold: usize) -> Self { - Self { - cleanup_threshold, - ..Default::default() - } - } - - /// Get the current timestamp as seconds since initialization - pub fn current_timestamp(&self) -> f64 { - self.start_time.elapsed().as_secs_f64() + /// Create a new LRUEvictor + pub fn new(_cleanup_threshold: usize) -> Self { + // Keep the parameter for API compatibility, but ignore it + Self::default() } /// Get an iterator over the keys in the evictor - pub fn keys(&self) -> std::collections::hash_map::Keys<'_, T, f64> { + pub fn keys(&self) -> std::collections::hash_map::Keys<'_, T, i64> { self.free_table.keys() } - /// Insert or update an object in the evictor with current timestamp + /// Private helper method to update the data structures with object and counter + fn _update(&mut self, object: T, counter: i64) { + self.free_table.insert(object.clone(), counter); + self.priority_queue.insert(PriorityItem { + item: object, + counter, + }); + } + + /// Insert or update an object in the evictor with positive counter pub fn insert(&mut self, object: T) { - let timestamp = self.current_timestamp(); - self._insert(object, timestamp); + // Remove old entry if it exists + if let Some(&old_counter) = self.free_table.get(&object) { + self.priority_queue.remove(&PriorityItem { + item: object.clone(), + counter: old_counter, + }); + } + + // Increment positive counter and insert + self.positive_counter += 1; + let counter = self.positive_counter; + + self._update(object, counter); + } + + /// Push an object to the front with negative counter (highest priority for eviction) + pub fn push_front(&mut self, object: T) { + // Remove old entry if it exists + if let Some(&old_counter) = self.free_table.get(&object) { + self.priority_queue.remove(&PriorityItem { + item: object.clone(), + counter: old_counter, + }); + } + + // Decrement negative counter and insert + self.negative_counter -= 1; + let counter = self.negative_counter; + + self._update(object, counter); } /// Check if the evictor contains the given object @@ -74,39 +117,29 @@ impl LRUEvictor { self.free_table.contains_key(object) } - /// Evict an object based on LRU policy + /// Evict an object based on LRU policy (lowest counter value) /// Returns the evicted object or None if no objects are available pub fn evict(&mut self) -> Option { - if self.free_table.is_empty() { - return None; - } - - while let Some((object, last_accessed)) = self.priority_queue.pop_front() { - let Some(¤t_last_accessed) = self.free_table.get(&object) else { - continue; // entry is already removed - }; - - if current_last_accessed == last_accessed { - self.free_table.remove(&object); - return Some(object); - } // otherwise entry is stale + if let Some(item) = self.priority_queue.pop_first() { + self.free_table.remove(&item.item); + Some(item.item) + } else { + None } - - None - } - - /// Insert or update an object in the evictor - fn _insert(&mut self, object: T, last_accessed: f64) { - self.free_table.insert(object.clone(), last_accessed); - self.priority_queue.push_back((object, last_accessed)); - self.cleanup_if_necessary(); } /// Remove an object from the evictor - /// We don't remove from the priority queue immediately, as that would be inefficient - /// Outdated entries will be filtered out during eviction or cleanup pub fn remove(&mut self, object: &T) -> bool { - self.free_table.remove(object).is_some() + if let Some(&counter) = self.free_table.get(object) { + self.free_table.remove(object); + self.priority_queue.remove(&PriorityItem { + item: object.clone(), + counter, + }); + true + } else { + false + } } /// Get the number of objects in the evictor @@ -118,62 +151,31 @@ impl LRUEvictor { pub fn is_empty(&self) -> bool { self.free_table.is_empty() } - - /// Check if cleanup is necessary and perform it if needed - fn cleanup_if_necessary(&mut self) { - if self.priority_queue.len() > self.cleanup_threshold * self.free_table.len() { - self.cleanup(); - } - } - - /// Clean up the priority queue by removing outdated entries - fn cleanup(&mut self) { - let mut new_priority_queue = VecDeque::new(); - for (object, timestamp) in self.priority_queue.drain(..) { - let Some(¤t_timestamp) = self.free_table.get(&object) else { - continue; - }; - - if current_timestamp == timestamp { - new_priority_queue.push_back((object, timestamp)); - } - } - self.priority_queue = new_priority_queue; - } } #[cfg(test)] mod tests { use super::*; - use rstest::rstest; - #[rstest] - #[case(1)] - #[case(2)] - #[case(3)] - fn test_lru_evictor_eviction_order(#[case] threshold: usize) { - // Create a new LRUEvictor with the given cleanup threshold - let mut evictor = LRUEvictor::::new(threshold); + #[test] + fn test_lru_evictor_eviction_order() { + // Create a new LRUEvictor + let mut evictor = LRUEvictor::::new(1); // threshold value doesn't matter anymore - // Add items in the specified order with small delays between each + // Add items in the specified order evictor.insert(4); - std::thread::sleep(std::time::Duration::from_millis(1)); evictor.insert(3); - std::thread::sleep(std::time::Duration::from_millis(1)); evictor.insert(2); - std::thread::sleep(std::time::Duration::from_millis(1)); evictor.insert(1); - std::thread::sleep(std::time::Duration::from_millis(1)); evictor.insert(5); - std::thread::sleep(std::time::Duration::from_millis(1)); - evictor.insert(1); // Updates timestamp for 1 - std::thread::sleep(std::time::Duration::from_millis(1)); - evictor.insert(4); // Updates timestamp for 4 - std::thread::sleep(std::time::Duration::from_millis(1)); - evictor.insert(2); // Updates timestamp for 2 + evictor.insert(1); // Updates counter for 1 + evictor.insert(4); // Updates counter for 4 + evictor.insert(2); // Updates counter for 2 + evictor.push_front(4); // Verify the eviction order - println!("Testing with threshold {}", threshold); + let evicted = evictor.evict().unwrap(); + assert_eq!(evicted, 4); let evicted = evictor.evict().unwrap(); assert_eq!(evicted, 3); let evicted = evictor.evict().unwrap(); @@ -181,11 +183,11 @@ mod tests { let evicted = evictor.evict().unwrap(); assert_eq!(evicted, 1); let evicted = evictor.evict().unwrap(); - assert_eq!(evicted, 4); - let evicted = evictor.evict().unwrap(); assert_eq!(evicted, 2); let evicted = evictor.evict(); assert_eq!(evicted, None); assert_eq!(evictor.len(), 0); } + + // ... existing test_push_front test ... } From f2343d5090d5ead69f06e03b6fc2d30134b7c073 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 27 May 2025 14:18:05 -0700 Subject: [PATCH 02/36] preemption is push_front --- lib/llm/src/mocker/scheduler.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index e71647feab..d26f6ab5a2 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -174,7 +174,7 @@ impl SchedulerState { // Insert the new sequence back into the requests map and add to waiting queue self.requests.insert(uuid, Request::Active(active_sequence)); - self.waiting.push_back(uuid); + self.waiting.push_front(uuid); Some(signals) } @@ -546,7 +546,7 @@ mod tests { // Manual debug ticker that prints forward pass metrics _ = debug_interval.tick() => { let _metrics = scheduler.get_forward_pass_metrics().await; - // println!("Forward Pass Metrics: {:#?}", _metrics); + println!("Forward Pass Metrics: {:#?}", _metrics); } Some(_) = output_rx.recv() => { From 6fe3154115fcda4f49c0ab00a78b042540b1bb22 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 27 May 2025 14:58:54 -0700 Subject: [PATCH 03/36] use Hongkuan's quadratic formulas for decode and prefill --- lib/llm/src/mocker/kv_manager.rs | 7 ++++- lib/llm/src/mocker/scheduler.rs | 50 +++++++++++++++++--------------- lib/llm/src/mocker/sequence.rs | 24 ++++----------- 3 files changed, 37 insertions(+), 44 deletions(-) diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs index 8a7a8fefed..caa6fda110 100644 --- a/lib/llm/src/mocker/kv_manager.rs +++ b/lib/llm/src/mocker/kv_manager.rs @@ -200,6 +200,11 @@ impl KvManager { self.active_blocks.len() } + /// Get the percentage of active blocks relative to maximum capacity + pub fn get_active_perc(&self) -> f64 { + self.active_blocks.len() as f64 / self.max_capacity as f64 + } + /// Get the number of inactive blocks pub fn num_inactive_blocks(&self) -> usize { self.inactive_blocks.len() @@ -261,7 +266,7 @@ impl KvManager { // Calculate prefill compute let prefill_compute = - new_tokens as f64 * (new_tokens + overlap_blocks * self.block_size) as f64; + 1.25e-6 * (new_tokens as f64).powi(2) + 7.41e-2 * (new_tokens as f64) + 2.62e1; Some(PrefillCost { new_tokens, diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index d26f6ab5a2..e01b677e7e 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -194,7 +194,7 @@ impl Scheduler { kv_capacity: usize, watermark: f64, block_size: usize, - chunk_size: Option, + speedup_ratio: Option, output_tx: Option>, cancellation_token: Option, ) -> Self { @@ -205,7 +205,16 @@ impl Scheduler { let state = Arc::new(Mutex::new(SchedulerState::default())); let kv_manager = Arc::new(Mutex::new(kv_manager)); - let chunk_size = chunk_size.unwrap_or(256); + + // Assert speedup_ratio is greater than 0 if provided + if let Some(ratio) = speedup_ratio { + assert!( + ratio > 0.0, + "speedup_ratio must be greater than 0, got: {}", + ratio + ); + } + let speedup_ratio = speedup_ratio.unwrap_or(1.0); // Create channel for request handling let (request_tx, mut request_rx) = mpsc::channel::(1024); @@ -242,7 +251,7 @@ impl Scheduler { // Process DirectRequests, converting them to ActiveSequence and scheduling them until we can't // schedule anymore. while let Some((uuid, request)) = state_guard.next() { - let active_sequence = get_active_sequence(request, block_size, chunk_size); + let active_sequence = get_active_sequence(request, block_size); // Calculate token budget using new_tokens from PrefillCost let total_prefill_tokens = state_guard.num_batched_tokens(); @@ -271,10 +280,10 @@ impl Scheduler { let mut state_guard = state_clone.lock().await; let mut kv_manager_guard = kv_manager_clone.lock().await; - // Base time needed for decoding (assumed memory bound on KV cache) - let active_tokens = kv_manager_guard.num_active_blocks() * block_size; - // TODO: 2 is a dummy / magic scaling factor - let mut generation_time = Duration::from_micros((active_tokens / 2) as u64); + // Base time needed for decoding using active percentage and quadratic formula + let active_perc = kv_manager_guard.get_active_perc(); + let decoding_time = -5.47 * active_perc.powi(2) + 43.88 * active_perc + 19.44; + let mut total_time = Duration::from_secs_f64(decoding_time / 1000.0); // Process each running request let uuids: Vec = state_guard.running.keys().cloned().collect(); @@ -285,7 +294,7 @@ impl Scheduler { } // Get prefill compute value first - let prefill_compute = state_guard.get_prefill_compute(&uuid); + let prefill_compute = state_guard.get_prefill_compute(&uuid).unwrap_or(0.); // Get the active sequence for this UUID let sequence = state_guard.requests.get_mut(&uuid) @@ -295,14 +304,6 @@ impl Scheduler { // Generate token and get signals let signals = sequence.generate(); - // Accumulate sleep duration based on prefill_compute if available - // prefill compute = (cached_tokens + new_tokens) * new_tokens - let sleep_ms = if let Some(compute) = prefill_compute { - // TODO: 1024 is a dummy / magic scaling factor - (compute / 1024.0) as u64 - } else { 0 }; - generation_time += Duration::from_micros(sleep_ms); - // Process all signals with the KvManager // Handling of preemption on failure if !process_signals(&mut kv_manager_guard, &signals) { @@ -319,8 +320,10 @@ impl Scheduler { continue; } + // Accumulate sleep duration based on prefill_compute if available + total_time += Duration::from_secs_f64(prefill_compute / 1000.0); + // Send UUID notification for each generated token - // TODO: hook this up to an AsyncEngine if let Some(tx) = &output_tx_clone { let _ = tx.try_send(uuid); } @@ -337,9 +340,10 @@ impl Scheduler { } } - // Sleep once for the accumulated duration - if generation_time.as_millis() > 0 { - tokio::time::sleep(generation_time).await; + // Sleep once for the adjusted duration + let adjusted_time = Duration::from_secs_f64(total_time.as_secs_f64() / speedup_ratio); + if adjusted_time.as_millis() > 0 { + tokio::time::sleep(adjusted_time).await; } } } @@ -405,7 +409,7 @@ impl Scheduler { } /// Convert a Request to an ActiveSequence -fn get_active_sequence(request: Request, block_size: usize, chunk_size: usize) -> ActiveSequence { +fn get_active_sequence(request: Request, block_size: usize) -> ActiveSequence { if let Request::Active(active_seq) = request { return active_seq; } @@ -418,7 +422,6 @@ fn get_active_sequence(request: Request, block_size: usize, chunk_size: usize) - direct_request.tokens, direct_request.max_output_tokens, Some(block_size), - Some(chunk_size), ) } @@ -475,7 +478,6 @@ mod tests { let kv_capacity: usize = 500; let watermark: f64 = 0.01; // 1% watermark let block_size: usize = 64; - let chunk_size: usize = 256; let num_requests: usize = 100; let input_len: usize = 1000; let max_output_tokens: usize = 100; @@ -488,7 +490,7 @@ mod tests { kv_capacity, watermark, block_size, - Some(chunk_size), + Some(10.0), // speedup_ratio Some(output_tx), None, ); diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs index d53dd870e1..b79e79872a 100644 --- a/lib/llm/src/mocker/sequence.rs +++ b/lib/llm/src/mocker/sequence.rs @@ -52,9 +52,6 @@ pub struct ActiveSequence { #[getter(copy)] block_size: usize, - #[getter(copy)] - chunk_size: usize, // TODO: not actually used - #[getter(copy)] max_output_tokens: usize, @@ -69,15 +66,9 @@ pub struct ActiveSequence { impl ActiveSequence { /// Create a new ActiveSequence instance with the provided tokens - pub fn new( - tokens: Vec, - max_output_tokens: usize, - block_size: Option, - chunk_size: Option, - ) -> Self { + pub fn new(tokens: Vec, max_output_tokens: usize, block_size: Option) -> Self { let block_size = block_size.unwrap_or(64); assert!(block_size > 1, "block_size must be greater than 1"); - let chunk_size = chunk_size.unwrap_or(256); let num_input_tokens = tokens.len(); let tokens = Tokens::from(tokens).into_sequence(block_size, None); @@ -88,7 +79,6 @@ impl ActiveSequence { unique_blocks, tokens, block_size, - chunk_size, max_output_tokens, generated_tokens: 0, num_input_tokens, @@ -113,9 +103,8 @@ impl ActiveSequence { tokens: Vec, max_output_tokens: usize, block_size: Option, - chunk_size: Option, ) -> (Self, Option) { - let mut sequence = Self::new(tokens, max_output_tokens, block_size, chunk_size); + let mut sequence = Self::new(tokens, max_output_tokens, block_size); let signal = sequence.creation_signal.take(); (sequence, signal) } @@ -237,8 +226,7 @@ mod tests { fn test_active_sequence_push() { // Create a sequence with block size 16 initialized with tokens [0..15] let initial_tokens: Vec = (0..15).collect(); - let (mut seq1, signal1) = - ActiveSequence::new_with_signal(initial_tokens, 100, Some(16), Some(256)); + let (mut seq1, signal1) = ActiveSequence::new_with_signal(initial_tokens, 100, Some(16)); assert_eq!(seq1.num_input_tokens(), 15); assert_eq!(seq1.len(), 15); @@ -289,8 +277,7 @@ mod tests { // Create another sequence with block size 16 initialized with tokens [0..17] let extended_tokens: Vec = (0..16).collect(); - let (mut seq2, _) = - ActiveSequence::new_with_signal(extended_tokens, 100, Some(16), Some(256)); + let (mut seq2, _) = ActiveSequence::new_with_signal(extended_tokens, 100, Some(16)); seq2.push(16); seq2.pop(); seq2.push(16); @@ -363,8 +350,7 @@ mod tests { fn test_active_sequence_generate_signals() { // Create a sequence with block size 16, max_output_tokens 4, initialized with tokens [0..14) let initial_tokens: Vec = (0..14).collect(); - let (mut seq, signal) = - ActiveSequence::new_with_signal(initial_tokens, 5, Some(16), Some(256)); + let (mut seq, signal) = ActiveSequence::new_with_signal(initial_tokens, 5, Some(16)); // Initial signal - should have received a Use signal for the partial block assert!(signal.is_some()); From cccebad010578507075b571f7e142f960109e2e8 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 28 May 2025 00:29:58 -0700 Subject: [PATCH 04/36] cleaner scheduling + generation separation, and waterline bug fix --- lib/llm/src/mocker/kv_manager.rs | 52 ++----- lib/llm/src/mocker/protocols.rs | 1 + lib/llm/src/mocker/scheduler.rs | 227 +++++++++++++++---------------- 3 files changed, 121 insertions(+), 159 deletions(-) diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs index caa6fda110..71522dcb58 100644 --- a/lib/llm/src/mocker/kv_manager.rs +++ b/lib/llm/src/mocker/kv_manager.rs @@ -178,7 +178,7 @@ impl KvManager { pub fn probe_new_blocks(&self, blocks: &[UniqueBlock]) -> usize { blocks .iter() - .filter(|&block| !self.all_blocks.contains(block)) + .filter(|&block| !self.active_blocks.contains_key(block)) .count() } @@ -221,57 +221,21 @@ impl KvManager { } /// Check if a sequence can be scheduled and calculate cost if possible - pub fn try_schedule( - &self, - sequence: &ActiveSequence, - watermark: f64, - tokens_budget: usize, - ) -> Option { - // Return None immediately if tokens_budget is 0 - if tokens_budget == 0 { - return None; - } - - // Get unique blocks from the sequence - let unique_blocks = sequence.unique_blocks(); - - // Get the count of new blocks - let new_blocks = self.probe_new_blocks(unique_blocks); - - // Calculate current usage and available capacity - let active_count = self.active_blocks.len(); - - // Check if we can schedule based on the watermark - if (active_count + new_blocks) as f64 > (1.0 - watermark) * self.max_capacity as f64 { - return None; - } - - // Calculate overlap blocks - let overlap_blocks = unique_blocks.len() - new_blocks; - - // Calculate new tokens + pub fn get_prefill_cost(&self, sequence: &ActiveSequence) -> PrefillCost { + let seq_blocks = sequence.unique_blocks(); + let new_blocks = self.probe_new_blocks(seq_blocks); + let overlap_blocks = seq_blocks.len() - new_blocks; let new_tokens = sequence.num_input_tokens() - overlap_blocks * self.block_size; - // // Print the full equation with actual values substituted - // println!("{} = {} - ({} * {}) (new_tokens = num_input_tokens - overlap_blocks * block_size)", - // new_tokens, - // sequence.num_input_tokens(), - // overlap_blocks, - // self.block_size); - - // Return None if new_tokens exceeds tokens_budget - if new_tokens > tokens_budget { - return None; - } - // Calculate prefill compute let prefill_compute = 1.25e-6 * (new_tokens as f64).powi(2) + 7.41e-2 * (new_tokens as f64) + 2.62e1; - Some(PrefillCost { + PrefillCost { + new_blocks, new_tokens, prefill_compute, - }) + } } } diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs index 2b551db61b..51440ee9d4 100644 --- a/lib/llm/src/mocker/protocols.rs +++ b/lib/llm/src/mocker/protocols.rs @@ -57,6 +57,7 @@ pub struct DirectRequest { /// Represents the cost of prefilling content in the cache #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PrefillCost { + pub new_blocks: usize, pub new_tokens: usize, pub prefill_compute: f64, } diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index e01b677e7e..5f6776a9b1 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -63,8 +63,8 @@ pub enum Request { #[derive(Default)] struct SchedulerState { waiting: VecDeque, - ready: VecDeque, - running: LRUEvictor, + prefill: VecDeque, + decode: LRUEvictor, requests: HashMap, prefill_costs: HashMap>, } @@ -74,61 +74,66 @@ impl SchedulerState { fn receive(&mut self, request: DirectRequest) -> Uuid { // Use the provided UUID if available, otherwise generate a new one let uuid = request.uuid.unwrap_or_else(Uuid::new_v4); - - // Add the request to the map and waiting queue self.requests.insert(uuid, Request::Direct(request)); self.waiting.push_back(uuid); uuid } /// Get the next UUID from ready or waiting queue and its associated Request. - /// Returns from ready if not empty, otherwise from waiting, or None if both are empty. - /// Also removes the Request from the requests HashMap. fn next(&mut self) -> Option<(Uuid, Request)> { - let uuid = self - .ready - .pop_front() - .or_else(|| self.waiting.pop_front())?; - let request = self.requests.remove(&uuid)?; + let uuid = self.waiting.pop_front()?; + let request = self + .requests + .remove(&uuid) + .expect("Request does not exist."); Some((uuid, request)) } + /// Move a UUID and its Request to the waiting queue (front). + fn first_in_line(&mut self, uuid: Uuid, request: Request) { + self.requests.insert(uuid, request); + self.waiting.push_front(uuid); + } + /// Move a UUID and its Request to the ready queue. - fn make_ready(&mut self, uuid: Uuid, active_seq: ActiveSequence) { + fn start_prefill(&mut self, uuid: Uuid, active_seq: ActiveSequence, cost: Option) { self.requests.insert(uuid, Request::Active(active_seq)); - self.ready.push_back(uuid); + self.prefill.push_back(uuid); + self.prefill_costs.insert(uuid, cost); } - /// Schedule the request with the given UUID. - /// Returns the creation signal from the ActiveSequence. - fn run(&mut self, uuid: Uuid, active_seq: ActiveSequence) -> MoveBlock { - // Insert the request into the map - self.requests.insert(uuid, Request::Active(active_seq)); + /// Pop from prefill queue and move to decode queue. + /// Returns the prefill_compute value if available. + fn start_decode(&mut self) -> Option<(f64, MoveBlock)> { + let uuid = self.prefill.pop_front()?; + self.decode.insert(uuid); + + // Remove and extract prefill_compute from prefill_costs + let prefill_cost = self + .prefill_costs + .remove(&uuid) + .flatten() + .expect("Expects valid prefill cost."); - // Get the creation signal let Some(Request::Active(sequence)) = self.requests.get(&uuid) else { - panic!("Failed to get ActiveSequence for UUID"); - }; - let Some(signal) = sequence.creation_signal() else { - panic!("Failed to get creation signal from ActiveSequence"); + panic!("Request does not exist."); }; + let creation_signal = sequence + .creation_signal() + .clone() + .expect("Must have creation signal."); - // Add to running requests - self.running.insert(uuid); - signal.clone() - } - - /// Set the prefill cost for a UUID - fn set_prefill_cost(&mut self, uuid: Uuid, cost: Option) { - self.prefill_costs.insert(uuid, cost); + Some((prefill_cost.prefill_compute, creation_signal)) } - /// Get the prefill compute value for a UUID if available - fn get_prefill_compute(&self, uuid: &Uuid) -> Option { - self.prefill_costs - .get(uuid) - .and_then(|cost| cost.as_ref()) - .map(|cost| cost.prefill_compute) + fn run(&mut self, uuid: Uuid) -> Option<&mut ActiveSequence> { + if !self.decode.contains(&uuid) { + return None; + } + let Some(Request::Active(sequence)) = self.requests.get_mut(&uuid) else { + panic!("Request does not exist."); + }; + Some(sequence) } /// Calculate the current running batched tokens @@ -145,7 +150,7 @@ impl SchedulerState { /// Remove a UUID and its associated Request from collections. fn complete(&mut self, uuid: &Uuid) { // println!("Request {} will complete", uuid); - self.running.remove(uuid); + self.decode.remove(uuid); self.requests.remove(uuid); self.prefill_costs.remove(uuid); } @@ -153,30 +158,29 @@ impl SchedulerState { /// Preempt the oldest running request by evicting it from running, resetting the sequence, /// and adding it back to the waiting queue. /// Returns the signal from reset_with_signal or None if no requests are running. - fn preempt(&mut self) -> Option> { + fn preempt(&mut self) -> Vec { // Evict the oldest UUID from running - let uuid = self.running.evict()?; - eprintln!("Request {} will be preempted", uuid); - - // Remove the request from the requests HashMap and ensure it's an ActiveSequence - let request = self.requests.remove(&uuid)?; - - // Remove the prefill cost to force recomputation + let uuid = self + .decode + .evict() + .expect("Nothing to evict for preemption."); + let request = self + .requests + .remove(&uuid) + .expect("Request does not exist."); self.prefill_costs.remove(&uuid); + eprintln!("Request {} will be preempted", uuid); // Extract the ActiveSequence from the Request enum + // Reset the sequence and get the new sequence and signal + // Insert the new sequence back into the requests map and add to waiting queue let Request::Active(mut active_sequence) = request else { panic!("Expected ActiveSequence in running queue") }; - - // Reset the sequence and get the new sequence and signal let signals = active_sequence.reset_with_signal(); + self.first_in_line(uuid, Request::Active(active_sequence)); - // Insert the new sequence back into the requests map and add to waiting queue - self.requests.insert(uuid, Request::Active(active_sequence)); - self.waiting.push_front(uuid); - - Some(signals) + signals } } @@ -191,20 +195,19 @@ pub struct Scheduler { impl Scheduler { /// Create a new Scheduler with the given parameters pub fn new( - kv_capacity: usize, - watermark: f64, + num_gpu_blocks: usize, block_size: usize, + max_num_batched_tokens: Option, + watermark: Option, speedup_ratio: Option, output_tx: Option>, cancellation_token: Option, ) -> Self { - // Create KvManager internally - let kv_manager = KvManager::new(kv_capacity, block_size); + let max_num_batched_tokens = max_num_batched_tokens.unwrap_or(8192); + let watermark = watermark.unwrap_or(0.01); - let token_capacity: usize = 8192; let state = Arc::new(Mutex::new(SchedulerState::default())); - - let kv_manager = Arc::new(Mutex::new(kv_manager)); + let kv_manager = Arc::new(Mutex::new(KvManager::new(num_gpu_blocks, block_size))); // Assert speedup_ratio is greater than 0 if provided if let Some(ratio) = speedup_ratio { @@ -219,19 +222,17 @@ impl Scheduler { // Create channel for request handling let (request_tx, mut request_rx) = mpsc::channel::(1024); - // Use provided cancellation token or create new one - let cancellation_token = cancellation_token.unwrap_or_default(); - let token_clone = cancellation_token.clone(); - // Create a clone for the background task let state_clone = state.clone(); let kv_manager_clone = kv_manager.clone(); let output_tx_clone = output_tx.clone(); + let cancel_token_clone = cancellation_token.unwrap_or_default().clone(); // Spawn main background task with cancellation token tokio::spawn(async move { - let mut schedule_interval = interval(Duration::from_millis(5)); - let mut simulate_interval = interval(Duration::from_millis(1)); + let mut schedule_interval = interval(Duration::from_secs_f64(1e-3)); + let mut simulate_interval = interval(Duration::from_secs_f64(1e-4)); + let mut should_schedule = true; loop { tokio::select! { @@ -243,35 +244,45 @@ impl Scheduler { state.receive(request); } - // Try Scheduling Requests + // Try Scheduling Requests - runs on normal interval or after simulation _ = schedule_interval.tick() => { + // Skip if we just ran scheduling after simulation to prevent consecutive runs + if !should_schedule { + continue; + } + let mut state_guard = state_clone.lock().await; - let mut kv_manager_guard = kv_manager_clone.lock().await; + let kv_manager_guard = kv_manager_clone.lock().await; // Process DirectRequests, converting them to ActiveSequence and scheduling them until we can't // schedule anymore. + let mut current_blocks = kv_manager_guard.num_active_blocks(); + let mut current_tokens = state_guard.num_batched_tokens(); while let Some((uuid, request)) = state_guard.next() { let active_sequence = get_active_sequence(request, block_size); - // Calculate token budget using new_tokens from PrefillCost - let total_prefill_tokens = state_guard.num_batched_tokens(); - let tokens_budget = token_capacity.saturating_sub(total_prefill_tokens); + // Update predictive budgets + let prefill_cost = kv_manager_guard.get_prefill_cost(&active_sequence); + let new_blocks = prefill_cost.new_blocks; + let new_tokens = prefill_cost.new_tokens; + current_blocks += new_blocks; + current_tokens += new_tokens; // Check if it can be scheduled - let Some(prefill_cost) = kv_manager_guard.try_schedule(&active_sequence, watermark, tokens_budget) else { - state_guard.make_ready(uuid, active_sequence); + let under_block_budget = current_blocks as f64 <= (1. - watermark) * kv_manager_guard.max_capacity() as f64; + let under_token_budget = current_tokens <= max_num_batched_tokens; + if under_block_budget && under_token_budget { + state_guard.start_prefill(uuid, active_sequence, Some(prefill_cost)); + should_schedule = false; + } else { + state_guard.first_in_line(uuid, Request::Active(active_sequence)); break; - }; - - // Get creation signal and schedule the request - let signal = state_guard.run(uuid, active_sequence); - kv_manager_guard.process(&signal); - state_guard.set_prefill_cost(uuid, Some(prefill_cost)); + } } } // Check for cancellation - _ = token_clone.cancelled() => { + _ = cancel_token_clone.cancelled() => { break; } @@ -285,44 +296,36 @@ impl Scheduler { let decoding_time = -5.47 * active_perc.powi(2) + 43.88 * active_perc + 19.44; let mut total_time = Duration::from_secs_f64(decoding_time / 1000.0); - // Process each running request - let uuids: Vec = state_guard.running.keys().cloned().collect(); - for uuid in uuids { - // Check if UUID is still in running_requests, if not skip this iteration - if !state_guard.running.contains(&uuid) { - continue; + // Process prefilling + while let Some((prefill_compute, creation_signal)) = state_guard.start_decode() { + // NOTE: Prefill cost/time is always incremented for new blocks, even if they + // could be cached by other requests in the same batch. This matches vLLM behavior. + total_time += Duration::from_secs_f64(prefill_compute / 1000.0); + let prefill_success = process_signals(&mut kv_manager_guard, std::slice::from_ref(&creation_signal)); + if !prefill_success { + panic!("Block allocation for prefilling cannot fail."); } + } - // Get prefill compute value first - let prefill_compute = state_guard.get_prefill_compute(&uuid).unwrap_or(0.); - - // Get the active sequence for this UUID - let sequence = state_guard.requests.get_mut(&uuid) - .and_then(|req| if let Request::Active(seq) = req { Some(seq) } else { None }) - .expect("UUID in running_requests must have a corresponding active sequence"); - - // Generate token and get signals + // Process decoding + let uuids: Vec = state_guard.decode.keys().cloned().collect(); + if !uuids.is_empty() {should_schedule = true}; + for uuid in uuids { + let Some(sequence) = state_guard.run(uuid) else { + continue; + }; let signals = sequence.generate(); // Process all signals with the KvManager // Handling of preemption on failure if !process_signals(&mut kv_manager_guard, &signals) { sequence.pop(); // revert the failed generation op - - // free_signal derefs the preempted blocks - let Some(free_signal) = state_guard.preempt() else { - panic!("Failed to acquire signal to free KV blocks from preemption"); - }; - - for signal in free_signal { + for signal in state_guard.preempt() { kv_manager_guard.process(&signal); } continue; } - // Accumulate sleep duration based on prefill_compute if available - total_time += Duration::from_secs_f64(prefill_compute / 1000.0); - // Send UUID notification for each generated token if let Some(tx) = &output_tx_clone { let _ = tx.try_send(uuid); @@ -333,11 +336,6 @@ impl Scheduler { state_guard.complete(&uuid); continue; } - - // Transition to decode (no prefill cost) - if sequence.generated_tokens() == 1 { - state_guard.set_prefill_cost(uuid, None); - } } // Sleep once for the adjusted duration @@ -371,7 +369,7 @@ impl Scheduler { /// Get the count of running requests pub async fn running_count(&self) -> usize { let state = self.state.lock().await; - state.running.len() + state.decode.len() } /// Get the current capacity of the KvManager @@ -397,7 +395,7 @@ impl Scheduler { }; ForwardPassMetrics { - request_active_slots: state.running.len() as u64, + request_active_slots: state.decode.len() as u64, request_total_slots: 420, // Dummy value as specified kv_active_blocks: active_blocks_count, kv_total_blocks: total_capacity, @@ -476,7 +474,6 @@ mod tests { std::env::set_var("RUST_LOG", "debug"); let kv_capacity: usize = 500; - let watermark: f64 = 0.01; // 1% watermark let block_size: usize = 64; let num_requests: usize = 100; let input_len: usize = 1000; @@ -488,8 +485,9 @@ mod tests { // Create scheduler with internal KvManager let scheduler = Scheduler::new( kv_capacity, - watermark, block_size, + None, + None, Some(10.0), // speedup_ratio Some(output_tx), None, @@ -548,7 +546,6 @@ mod tests { // Manual debug ticker that prints forward pass metrics _ = debug_interval.tick() => { let _metrics = scheduler.get_forward_pass_metrics().await; - println!("Forward Pass Metrics: {:#?}", _metrics); } Some(_) = output_rx.recv() => { From 394c2bf471c084a4322f902266c7b5d4a7e5e6bd Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Thu, 29 May 2025 17:39:41 -0700 Subject: [PATCH 05/36] restore printing out fwd pass metrics in test --- lib/llm/src/mocker/scheduler.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 5f6776a9b1..ddf83e4d11 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -546,6 +546,7 @@ mod tests { // Manual debug ticker that prints forward pass metrics _ = debug_interval.tick() => { let _metrics = scheduler.get_forward_pass_metrics().await; + println!("Forward Pass Metrics: {:#?}", _metrics); } Some(_) = output_rx.recv() => { From dad183f81503c973b31c96dee3c75435b2e82747 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 11 Jun 2025 00:42:03 -0700 Subject: [PATCH 06/36] multi-dp mocker engine --- lib/llm/src/mocker.rs | 1 + lib/llm/src/mocker/engine.rs | 251 ++++++++++++++++++++++++++++++++ lib/llm/src/mocker/protocols.rs | 38 +++++ lib/llm/src/mocker/scheduler.rs | 83 ++++++----- 4 files changed, 334 insertions(+), 39 deletions(-) create mode 100644 lib/llm/src/mocker/engine.rs diff --git a/lib/llm/src/mocker.rs b/lib/llm/src/mocker.rs index 2a9e63a9e2..4315868c49 100644 --- a/lib/llm/src/mocker.rs +++ b/lib/llm/src/mocker.rs @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub mod engine; pub mod evictor; pub mod kv_manager; pub mod protocols; diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs new file mode 100644 index 0000000000..ec70cd4ea2 --- /dev/null +++ b/lib/llm/src/mocker/engine.rs @@ -0,0 +1,251 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! MockSchedulerEngine - AsyncEngine wrapper around the Scheduler +//! +//! This module provides an AsyncEngine implementation that wraps the Scheduler +//! to provide streaming token generation with realistic timing simulation. + +use crate::mocker::protocols::{DirectRequest, MockEngineArgs, OutputSignal}; +use crate::mocker::scheduler::Scheduler; + +use dynamo_runtime::{ + engine::AsyncEngineContextProvider, + pipeline::{async_trait, AsyncEngine, Error, ManyOut, ResponseStream, SingleIn}, + protocols::annotated::Annotated, +}; + +use rand::Rng; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::{mpsc, Mutex}; +use tokio_stream::wrappers::ReceiverStream; +use uuid::Uuid; + +/// Generate a random printable character +fn generate_random_char() -> String { + let mut rng = rand::rng(); + let selection = match rng.random_range(0..4) { + 0 => ('a'..='z').nth(rng.random_range(0..26)).unwrap(), // lowercase + 1 => ('A'..='Z').nth(rng.random_range(0..26)).unwrap(), // uppercase + 2 => ('0'..='9').nth(rng.random_range(0..10)).unwrap(), // digits + _ => [' ', '.', ',', '!', '?'][rng.random_range(0..5)], // punctuation/space + }; + selection.to_string() +} + +/// AsyncEngine wrapper around the Scheduler that generates random character tokens +pub struct MockVllmEngine { + schedulers: Vec, + active_requests: Arc>>>, + dp_size: u32, +} + +impl MockVllmEngine { + /// Create a new MockVllmEngine with the given parameters + pub fn new(args: MockEngineArgs) -> Self { + let mut schedulers = Vec::new(); + let active_requests = Arc::new(Mutex::new( + HashMap::>::new(), + )); + + // Create multiple schedulers and their background tasks + for _ in 0..args.dp_size { + // Create a shared output channel that this scheduler will use + let (output_tx, output_rx) = mpsc::channel::(1024); + + let scheduler = Scheduler::new( + args.clone(), + Some(output_tx), + None, // No global cancellation token + ); + + schedulers.push(scheduler); + + // Spawn a background task for this scheduler to distribute token notifications to active requests + let output_rx = Arc::new(Mutex::new(output_rx)); + let active_requests_clone = active_requests.clone(); + + tokio::spawn(async move { + loop { + let signal = { + let mut rx = output_rx.lock().await; + match rx.recv().await { + Some(signal) => signal, + None => break, // Channel closed + } + }; + + // Notify the specific request that a token was generated + let active = active_requests_clone.lock().await; + if let Some(request_tx) = active.get(&signal.uuid) { + let _ = request_tx.send(signal).await; + } + } + }); + } + + Self { + schedulers, + active_requests, + dp_size: args.dp_size, + } + } +} + +#[async_trait] +impl AsyncEngine, ManyOut>, Error> for MockVllmEngine { + async fn generate( + &self, + input: SingleIn, + ) -> Result>, Error> { + let (mut request, ctx) = input.into_parts(); + + let dp_rank = request.dp_rank.unwrap_or(0); + + // Validate dp_rank + if dp_rank >= self.dp_size { + return Err(Error::msg(format!( + "dp_rank {} is out of bounds for dp_size {}", + dp_rank, self.dp_size + ))); + } + + let request_uuid = ctx.id().parse().unwrap_or(Uuid::new_v4()); + request.uuid = Some(request_uuid); + + let (request_tx, mut request_rx) = mpsc::channel::(64); + { + let mut active = self.active_requests.lock().await; + active.insert(request_uuid, request_tx); + } + + // Send the request to the appropriate scheduler based on dp_rank + self.schedulers[dp_rank as usize] + .receive(request.clone()) + .await; + + // Create a simple channel for the stream + let (stream_tx, stream_rx) = mpsc::channel::>(64); + + let active_requests = self.active_requests.clone(); + let async_context = ctx.context(); + + // Spawn a task to handle the complex async logic + tokio::spawn(async move { + loop { + tokio::select! { + Some(signal) = request_rx.recv() => { + if signal.completed { + break; + } + let output = generate_random_char(); + if stream_tx.send(Annotated::from_data(output)).await.is_err() { + break; + } + } + + _ = async_context.stopped() => { + break; + } + } + } + + // Clean up: remove this request from active requests + let mut active = active_requests.lock().await; + active.remove(&request_uuid); + }); + + // Create a simple ReceiverStream which is naturally Send + Sync + let stream = ReceiverStream::new(stream_rx); + Ok(ResponseStream::new(Box::pin(stream), ctx.context())) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use dynamo_runtime::pipeline::Context; + use futures::StreamExt; + + #[tokio::test] + async fn test_multiple_workers_with_token_limit() { + const DP_SIZE: u32 = 2; + const TOKENS_PER_REQUEST: usize = 20; + + // Create the MockVllmEngine using builder pattern + let args = MockEngineArgs::builder() + .speedup_ratio(10.0) + .dp_size(DP_SIZE) + .build() + .unwrap(); + + let engine = MockVllmEngine::new(args); + + // Create 4 DirectRequests: 2 for worker 0, 2 for worker 1 + let requests = vec![ + DirectRequest { + tokens: vec![1, 2, 3, 4], + max_output_tokens: TOKENS_PER_REQUEST, + uuid: None, + dp_rank: Some(0), + }, + DirectRequest { + tokens: vec![5, 6, 7, 8], + max_output_tokens: TOKENS_PER_REQUEST, + uuid: None, + dp_rank: Some(0), + }, + DirectRequest { + tokens: vec![9, 10, 11, 12], + max_output_tokens: TOKENS_PER_REQUEST, + uuid: None, + dp_rank: Some(1), + }, + DirectRequest { + tokens: vec![13, 14, 15, 16], + max_output_tokens: TOKENS_PER_REQUEST, + uuid: None, + dp_rank: Some(1), + }, + ]; + + // Generate streams and collect all tokens from each + for request in requests { + let ctx = Context::new(request); + let stream = engine.generate(ctx).await.unwrap(); + + let tokens: Vec<_> = stream.collect().await; + + // Verify each stream produces exactly the expected number of tokens + assert_eq!(tokens.len(), TOKENS_PER_REQUEST); + + // Verify all tokens contain valid data + for token in tokens { + assert!(token.data.is_some()); + } + } + + // Give a small delay to ensure cleanup tasks complete + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // Verify that active_requests is empty (all requests cleaned up) + let active_requests = engine.active_requests.lock().await; + assert!( + active_requests.is_empty(), + "Active requests should be empty after streams complete" + ); + } +} diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs index 51440ee9d4..fc66ce061b 100644 --- a/lib/llm/src/mocker/protocols.rs +++ b/lib/llm/src/mocker/protocols.rs @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use derive_builder::Builder; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -52,6 +53,7 @@ pub struct DirectRequest { pub tokens: Vec, pub max_output_tokens: usize, pub uuid: Option, + pub dp_rank: Option, } /// Represents the cost of prefilling content in the cache @@ -62,6 +64,42 @@ pub struct PrefillCost { pub prefill_compute: f64, } +/// Signal for output token generation with completion status +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OutputSignal { + pub uuid: Uuid, + pub completed: bool, +} + +/// Configuration arguments for MockVllmEngine +#[derive(Debug, Clone, Serialize, Deserialize, Builder)] +#[builder(pattern = "owned", build_fn(public))] +pub struct MockEngineArgs { + #[builder(default = "16384")] + pub num_gpu_blocks: usize, + + #[builder(default = "64")] + pub block_size: usize, + + #[builder(default)] + pub max_num_batched_tokens: Option, + + #[builder(default = "0.01")] + pub watermark: f64, + + #[builder(default = "1.0")] + pub speedup_ratio: f64, + + #[builder(default = "1")] + pub dp_size: u32, +} + +impl MockEngineArgs { + pub fn builder() -> MockEngineArgsBuilder { + MockEngineArgsBuilder::default() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 094ed2910b..971e3288ba 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -43,8 +43,8 @@ use crate::kv_router::protocols::ForwardPassMetrics; use crate::mocker::evictor::LRUEvictor; use crate::mocker::kv_manager::KvManager; -use crate::mocker::protocols::DirectRequest; -use crate::mocker::protocols::{MoveBlock, PrefillCost, UniqueBlock}; +use crate::mocker::protocols::{DirectRequest, MockEngineArgs}; +use crate::mocker::protocols::{MoveBlock, OutputSignal, PrefillCost, UniqueBlock}; use crate::mocker::sequence::ActiveSequence; use std::collections::HashMap; use std::collections::VecDeque; @@ -195,29 +195,22 @@ pub struct Scheduler { impl Scheduler { /// Create a new Scheduler with the given parameters pub fn new( - num_gpu_blocks: usize, - block_size: usize, - max_num_batched_tokens: Option, - watermark: Option, - speedup_ratio: Option, - output_tx: Option>, + args: MockEngineArgs, + output_tx: Option>, cancellation_token: Option, ) -> Self { - let max_num_batched_tokens = max_num_batched_tokens.unwrap_or(8192); - let watermark = watermark.unwrap_or(0.01); - let state = Arc::new(Mutex::new(SchedulerState::default())); - let kv_manager = Arc::new(Mutex::new(KvManager::new(num_gpu_blocks, block_size))); - - // Assert speedup_ratio is greater than 0 if provided - if let Some(ratio) = speedup_ratio { - assert!( - ratio > 0.0, - "speedup_ratio must be greater than 0, got: {}", - ratio - ); - } - let speedup_ratio = speedup_ratio.unwrap_or(1.0); + let kv_manager = Arc::new(Mutex::new(KvManager::new( + args.num_gpu_blocks, + args.block_size, + ))); + + // Assert speedup_ratio is greater than 0 + assert!( + args.speedup_ratio > 0.0, + "speedup_ratio must be greater than 0, got: {}", + args.speedup_ratio + ); // Create channel for request handling let (request_tx, mut request_rx) = mpsc::channel::(1024); @@ -259,7 +252,7 @@ impl Scheduler { let mut current_blocks = kv_manager_guard.num_active_blocks(); let mut current_tokens = state_guard.num_batched_tokens(); while let Some((uuid, request)) = state_guard.next() { - let active_sequence = get_active_sequence(request, block_size); + let active_sequence = get_active_sequence(request, args.block_size); // Update predictive budgets let prefill_cost = kv_manager_guard.get_prefill_cost(&active_sequence); @@ -269,8 +262,8 @@ impl Scheduler { current_tokens += new_tokens; // Check if it can be scheduled - let under_block_budget = current_blocks as f64 <= (1. - watermark) * kv_manager_guard.max_capacity() as f64; - let under_token_budget = current_tokens <= max_num_batched_tokens; + let under_block_budget = current_blocks as f64 <= (1. - args.watermark) * kv_manager_guard.max_capacity() as f64; + let under_token_budget = args.max_num_batched_tokens.is_none_or(|limit| current_tokens <= limit); if under_block_budget && under_token_budget { state_guard.start_prefill(uuid, active_sequence, Some(prefill_cost)); should_schedule = false; @@ -328,18 +321,29 @@ impl Scheduler { // Send UUID notification for each generated token if let Some(tx) = &output_tx_clone { - let _ = tx.try_send(uuid); + let signal = OutputSignal { + uuid, + completed: false, + }; + let _ = tx.try_send(signal); } // Check if we're done after generating if sequence.generated_tokens() >= sequence.max_output_tokens() { + if let Some(tx) = &output_tx_clone { + let signal = OutputSignal { + uuid, + completed: true, + }; + let _ = tx.try_send(signal); + } state_guard.complete(&uuid); continue; } } // Sleep once for the adjusted duration - let adjusted_time = Duration::from_secs_f64(total_time.as_secs_f64() / speedup_ratio); + let adjusted_time = Duration::from_secs_f64(total_time.as_secs_f64() / args.speedup_ratio); if adjusted_time.as_millis() > 0 { tokio::time::sleep(adjusted_time).await; } @@ -481,18 +485,18 @@ mod tests { let max_output_tokens: usize = 100; // Create channel for token output - let (output_tx, mut output_rx) = mpsc::channel::(1024); - - // Create scheduler with internal KvManager - let scheduler = Scheduler::new( - kv_capacity, - block_size, - None, - None, - Some(10.0), // speedup_ratio - Some(output_tx), - None, - ); + let (output_tx, mut output_rx) = mpsc::channel::(1024); + + // Create scheduler args using builder + let args = MockEngineArgs::builder() + .num_gpu_blocks(kv_capacity) + .block_size(block_size) + .speedup_ratio(10.0) + .build() + .unwrap(); + + // Create scheduler with new args struct + let scheduler = Scheduler::new(args, Some(output_tx), None); // Create shared tokens for caching case let shared_tokens = if use_shared_tokens { @@ -523,6 +527,7 @@ mod tests { tokens: input_tokens, max_output_tokens, uuid: None, + dp_rank: None, }; scheduler.receive(request).await; } From 009ec7874d33aa3e5de3f1b3df25b9803970357d Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 11 Jun 2025 19:25:43 -0700 Subject: [PATCH 07/36] fixed prefill cost, and more conservative watermarking --- lib/llm/src/mocker/engine.rs | 3 +- lib/llm/src/mocker/kv_manager.rs | 3 +- lib/llm/src/mocker/protocols.rs | 3 +- lib/llm/src/mocker/scheduler.rs | 163 ++++++++++++++++++++++++++++--- 4 files changed, 155 insertions(+), 17 deletions(-) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index ec70cd4ea2..21abaf3a1e 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -62,12 +62,13 @@ impl MockVllmEngine { )); // Create multiple schedulers and their background tasks - for _ in 0..args.dp_size { + for dp_rank in 0..args.dp_size { // Create a shared output channel that this scheduler will use let (output_tx, output_rx) = mpsc::channel::(1024); let scheduler = Scheduler::new( args.clone(), + Some(dp_rank), Some(output_tx), None, // No global cancellation token ); diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs index 71522dcb58..ca2d6fc50a 100644 --- a/lib/llm/src/mocker/kv_manager.rs +++ b/lib/llm/src/mocker/kv_manager.rs @@ -178,7 +178,8 @@ impl KvManager { pub fn probe_new_blocks(&self, blocks: &[UniqueBlock]) -> usize { blocks .iter() - .filter(|&block| !self.active_blocks.contains_key(block)) + // .filter(|&block| !self.active_blocks.contains_key(block)) + .filter(|&block| !self.all_blocks.contains(block)) .count() } diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs index fc66ce061b..4f59e2d8dc 100644 --- a/lib/llm/src/mocker/protocols.rs +++ b/lib/llm/src/mocker/protocols.rs @@ -81,7 +81,8 @@ pub struct MockEngineArgs { #[builder(default = "64")] pub block_size: usize, - #[builder(default)] + // default for open api server, for llm class it's 16384 + #[builder(default = Some(8192))] pub max_num_batched_tokens: Option, #[builder(default = "0.01")] diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 971e3288ba..5749db730a 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -178,6 +178,10 @@ impl SchedulerState { panic!("Expected ActiveSequence in running queue") }; let signals = active_sequence.reset_with_signal(); + + // Note: For preemption, we don't compute hit rate since we don't have access to new_tokens + // and the sequence is being reset anyway. Hit rate tracking is primarily for new scheduling attempts. + self.first_in_line(uuid, Request::Active(active_sequence)); signals @@ -187,15 +191,18 @@ impl SchedulerState { /// Manages scheduling of requests using KvManager resources #[derive(Clone)] pub struct Scheduler { + dp_rank: Option, state: Arc>, kv_manager: Arc>, request_tx: mpsc::Sender, + hit_rates: Arc>>, } impl Scheduler { /// Create a new Scheduler with the given parameters pub fn new( args: MockEngineArgs, + dp_rank: Option, output_tx: Option>, cancellation_token: Option, ) -> Self { @@ -204,6 +211,7 @@ impl Scheduler { args.num_gpu_blocks, args.block_size, ))); + let hit_rates = Arc::new(Mutex::new(VecDeque::with_capacity(1000))); // Assert speedup_ratio is greater than 0 assert!( @@ -220,6 +228,7 @@ impl Scheduler { let kv_manager_clone = kv_manager.clone(); let output_tx_clone = output_tx.clone(); let cancel_token_clone = cancellation_token.unwrap_or_default().clone(); + let hit_rates_clone = hit_rates.clone(); // Spawn main background task with cancellation token tokio::spawn(async move { @@ -256,7 +265,8 @@ impl Scheduler { // Update predictive budgets let prefill_cost = kv_manager_guard.get_prefill_cost(&active_sequence); - let new_blocks = prefill_cost.new_blocks; + let new_tokens = active_sequence.len(); + let new_blocks = (new_tokens + 1) / args.block_size; // this is conservative, assumes no cache hit let new_tokens = prefill_cost.new_tokens; current_blocks += new_blocks; current_tokens += new_tokens; @@ -264,13 +274,27 @@ impl Scheduler { // Check if it can be scheduled let under_block_budget = current_blocks as f64 <= (1. - args.watermark) * kv_manager_guard.max_capacity() as f64; let under_token_budget = args.max_num_batched_tokens.is_none_or(|limit| current_tokens <= limit); - if under_block_budget && under_token_budget { - state_guard.start_prefill(uuid, active_sequence, Some(prefill_cost)); - should_schedule = false; - } else { + + // Cannot schedule, put first in line instead + if !(under_block_budget && under_token_budget) { state_guard.first_in_line(uuid, Request::Active(active_sequence)); break; } + + // Compute and store hit rate + let hit_rate = (!active_sequence.is_empty()) + .then(|| 1.0 - (new_tokens as f32 / active_sequence.len() as f32)) + .unwrap_or(0.0); + { + let mut hit_rates_guard = hit_rates_clone.lock().await; + hit_rates_guard.push_back(hit_rate); + if hit_rates_guard.len() > 1000 { + hit_rates_guard.pop_front(); + } + } + + state_guard.start_prefill(uuid, active_sequence, Some(prefill_cost)); + should_schedule = false; } } @@ -343,6 +367,8 @@ impl Scheduler { } // Sleep once for the adjusted duration + drop(kv_manager_guard); + drop(state_guard); let adjusted_time = Duration::from_secs_f64(total_time.as_secs_f64() / args.speedup_ratio); if adjusted_time.as_millis() > 0 { tokio::time::sleep(adjusted_time).await; @@ -353,9 +379,11 @@ impl Scheduler { }); Self { + dp_rank, state, kv_manager, request_tx, + hit_rates, } } @@ -384,30 +412,44 @@ impl Scheduler { /// Returns forward pass metrics for monitoring purposes pub async fn get_forward_pass_metrics(&self) -> ForwardPassMetrics { + // Acquire all locks in consistent order: state -> kv_manager -> hit_rates let state = self.state.lock().await; let kv_manager = self.kv_manager.lock().await; + let hit_rates_guard = self.hit_rates.lock().await; - // Get the active blocks and total capacity from KvManager + // Get state metrics + let request_active_slots = state.decode.len() as u64; + let num_requests_waiting = state.waiting.len() as u64; + + // Get KV manager metrics let active_blocks_count = kv_manager.active_blocks().len() as u64; let total_capacity = kv_manager.max_capacity() as u64; - - // Calculate GPU cache usage percentage let gpu_cache_usage_perc = if total_capacity > 0 { active_blocks_count as f32 / total_capacity as f32 } else { 0.0 }; + // Get hit rate metrics + let gpu_prefix_cache_hit_rate = if hit_rates_guard.is_empty() { + 0.0 + } else { + let sum: f32 = hit_rates_guard.iter().sum(); + sum / hit_rates_guard.len() as f32 + }; + ForwardPassMetrics { - data_parallel_rank: None, // Default for backwards compatibility - request_active_slots: state.decode.len() as u64, - request_total_slots: 420, // Dummy value as specified + data_parallel_rank: self.dp_rank, + request_active_slots, + // vllm max_num_seqs for gpu >= 70 vram, otherwise 256, fallback is 128 + request_total_slots: 1024, kv_active_blocks: active_blocks_count, kv_total_blocks: total_capacity, - num_requests_waiting: state.waiting.len() as u64, + num_requests_waiting, gpu_cache_usage_perc, - gpu_prefix_cache_hit_rate: 0.0, // Placeholder value as specified + gpu_prefix_cache_hit_rate, } + // Guards drop naturally here in reverse order (LIFO): hit_rates_guard, kv_manager, state } } @@ -496,7 +538,7 @@ mod tests { .unwrap(); // Create scheduler with new args struct - let scheduler = Scheduler::new(args, Some(output_tx), None); + let scheduler = Scheduler::new(args, None, Some(output_tx), None); // Create shared tokens for caching case let shared_tokens = if use_shared_tokens { @@ -588,4 +630,97 @@ mod tests { expected_tokens ); } + + #[tokio::test] + async fn test_cache_hit_rate_with_identical_requests() { + let block_size: usize = 64; + let max_output_tokens: usize = 10; + let speedup_ratio = 10.0; + let num_requests = 10; + let token_length = 65; + + // Create channel for token output + let (output_tx, mut output_rx) = mpsc::channel::(1024); + + // Create scheduler args + let args = MockEngineArgs::builder() + .num_gpu_blocks(1000) // Large enough to not be a constraint + .block_size(block_size) + .speedup_ratio(speedup_ratio) + .build() + .unwrap(); + + // Create scheduler + let scheduler = Scheduler::new(args, None, Some(output_tx), None); + + // Create identical tokens for all requests + let identical_tokens: Vec = (0..token_length).map(|i| i as u32).collect(); + + // Send all requests with identical tokens + for _ in 0..num_requests { + let request = DirectRequest { + tokens: identical_tokens.clone(), + max_output_tokens, + uuid: None, + dp_rank: None, + }; + scheduler.receive(request).await; + // Sleep for 0.1 second after each request + tokio::time::sleep(Duration::from_millis(100)).await; + } + + // Collect all generated tokens + let mut received_tokens = 0; + + // Set up a timeout that resets to 0.5 seconds on each received token + let timeout = tokio::time::sleep(Duration::from_millis(500)); + tokio::pin!(timeout); + + // Set up debug ticker interval + let mut debug_interval = interval(Duration::from_millis(500)); + + loop { + tokio::select! { + biased; + + // Manual debug ticker that prints forward pass metrics + _ = debug_interval.tick() => { + let _metrics = scheduler.get_forward_pass_metrics().await; + println!("Forward Pass Metrics: {:#?}", _metrics); + } + + Some(_signal) = output_rx.recv() => { + received_tokens += 1; + // Reset timeout whenever we receive a token + timeout.set(tokio::time::sleep(Duration::from_millis(500))); + } + + _ = &mut timeout => { + // Break when timeout occurs (no more tokens for 0.5 seconds) + break; + } + } + } + + // Verify forward pass metrics + let metrics = scheduler.get_forward_pass_metrics().await; + + assert_eq!( + metrics.num_requests_waiting, 0, + "Expected no waiting requests, got {}", + metrics.num_requests_waiting + ); + + assert!( + metrics.gpu_prefix_cache_hit_rate > 0.8, + "Expected cache hit rate > 0.8, got {}", + metrics.gpu_prefix_cache_hit_rate + ); + + println!( + "Test passed! Cache hit rate: {:.3}", + metrics.gpu_prefix_cache_hit_rate + ); + println!("Received {} tokens", received_tokens); + } } From ee11427f2159c5914906dffccb778ab0740da724 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Wed, 11 Jun 2025 23:55:14 -0700 Subject: [PATCH 08/36] fwd pass metrics --- lib/llm/src/mocker/engine.rs | 128 +++++++++++++++++++++++++++----- lib/llm/src/mocker/scheduler.rs | 6 +- 2 files changed, 111 insertions(+), 23 deletions(-) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index 21abaf3a1e..9df68b6f52 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -18,19 +18,24 @@ //! This module provides an AsyncEngine implementation that wraps the Scheduler //! to provide streaming token generation with realistic timing simulation. +use crate::kv_router::publisher::WorkerMetricsPublisher; use crate::mocker::protocols::{DirectRequest, MockEngineArgs, OutputSignal}; use crate::mocker::scheduler::Scheduler; +use tokio_util::sync::CancellationToken; use dynamo_runtime::{ + component::Component, engine::AsyncEngineContextProvider, pipeline::{async_trait, AsyncEngine, Error, ManyOut, ResponseStream, SingleIn}, protocols::annotated::Annotated, + Result, }; use rand::Rng; use std::collections::HashMap; use std::sync::Arc; use tokio::sync::{mpsc, Mutex}; +use tokio::time::{interval, Duration}; use tokio_stream::wrappers::ReceiverStream; use uuid::Uuid; @@ -51,16 +56,47 @@ pub struct MockVllmEngine { schedulers: Vec, active_requests: Arc>>>, dp_size: u32, + cancel_token: CancellationToken, } impl MockVllmEngine { /// Create a new MockVllmEngine with the given parameters - pub fn new(args: MockEngineArgs) -> Self { - let mut schedulers = Vec::new(); + pub async fn new( + args: MockEngineArgs, + component: Option, + cancel_token: Option, + ) -> Result { let active_requests = Arc::new(Mutex::new( HashMap::>::new(), )); + let cancel_token = cancel_token.unwrap_or_default(); + + // Create schedulers and start their background tasks + let schedulers = + Self::start_schedulers(args.clone(), active_requests.clone(), cancel_token.clone()); + + // Start metrics publishing tasks + Self::start_metrics_publishing(&schedulers, component, cancel_token.clone()).await?; + + let engine = Self { + schedulers, + active_requests, + dp_size: args.dp_size, + cancel_token, + }; + + Ok(engine) + } + + /// Create schedulers and spawn their background tasks for distributing token notifications + fn start_schedulers( + args: MockEngineArgs, + active_requests: Arc>>>, + cancel_token: CancellationToken, + ) -> Vec { + let mut schedulers = Vec::new(); + // Create multiple schedulers and their background tasks for dp_rank in 0..args.dp_size { // Create a shared output channel that this scheduler will use @@ -70,7 +106,7 @@ impl MockVllmEngine { args.clone(), Some(dp_rank), Some(output_tx), - None, // No global cancellation token + Some(cancel_token.clone()), ); schedulers.push(scheduler); @@ -78,31 +114,80 @@ impl MockVllmEngine { // Spawn a background task for this scheduler to distribute token notifications to active requests let output_rx = Arc::new(Mutex::new(output_rx)); let active_requests_clone = active_requests.clone(); + let cancel_token_cloned = cancel_token.clone(); tokio::spawn(async move { loop { - let signal = { - let mut rx = output_rx.lock().await; - match rx.recv().await { - Some(signal) => signal, - None => break, // Channel closed + tokio::select! { + signal_result = async { + let mut rx = output_rx.lock().await; + rx.recv().await + } => { + let Some(signal) = signal_result else { + break; // Channel closed + }; + + // Notify the specific request that a token was generated + let active = active_requests_clone.lock().await; + if let Some(request_tx) = active.get(&signal.uuid) { + let _ = request_tx.send(signal).await; + } + } + _ = cancel_token_cloned.cancelled() => { + break; } - }; - - // Notify the specific request that a token was generated - let active = active_requests_clone.lock().await; - if let Some(request_tx) = active.get(&signal.uuid) { - let _ = request_tx.send(signal).await; } } }); } - Self { - schedulers, - active_requests, - dp_size: args.dp_size, + schedulers + } + + /// Start background tasks to poll and publish metrics every second + async fn start_metrics_publishing( + schedulers: &[Scheduler], + component: Option, + cancel_token: CancellationToken, + ) -> Result<()> { + let metrics_publisher = Arc::new(WorkerMetricsPublisher::new()?); + + if let Some(comp) = component { + metrics_publisher.create_endpoint(comp).await?; + } + + for (dp_rank, scheduler) in schedulers.iter().enumerate() { + let scheduler = scheduler.clone(); + let publisher = metrics_publisher.clone(); + let dp_rank = dp_rank as u32; + let cancel_token = cancel_token.clone(); + + tokio::spawn(async move { + let mut interval = interval(Duration::from_secs(1)); + + loop { + tokio::select! { + _ = interval.tick() => { + // Get metrics from scheduler + let metrics = scheduler.get_forward_pass_metrics().await; + + // Publish metrics + if let Err(e) = publisher.publish(Arc::new(metrics)) { + tracing::warn!("Failed to publish metrics for DP rank {}: {}", dp_rank, e); + } else { + tracing::trace!("Published metrics for DP rank {}", dp_rank); + } + } + _ = cancel_token.cancelled() => { + tracing::info!("Metrics publishing cancelled for DP rank {}", dp_rank); + break; + } + } + } + }); } + + Ok(()) } } @@ -143,6 +228,7 @@ impl AsyncEngine, ManyOut>, Error> for let active_requests = self.active_requests.clone(); let async_context = ctx.context(); + let cancel_token = self.cancel_token.clone(); // Spawn a task to handle the complex async logic tokio::spawn(async move { @@ -161,6 +247,10 @@ impl AsyncEngine, ManyOut>, Error> for _ = async_context.stopped() => { break; } + + _ = cancel_token.cancelled() => { + break; + } } } @@ -193,7 +283,7 @@ mod tests { .build() .unwrap(); - let engine = MockVllmEngine::new(args); + let engine = MockVllmEngine::new(args, None, None).await.unwrap(); // Create 4 DirectRequests: 2 for worker 0, 2 for worker 1 let requests = vec![ diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 5749db730a..368d21f55b 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -282,9 +282,7 @@ impl Scheduler { } // Compute and store hit rate - let hit_rate = (!active_sequence.is_empty()) - .then(|| 1.0 - (new_tokens as f32 / active_sequence.len() as f32)) - .unwrap_or(0.0); + let hit_rate = if !active_sequence.is_empty() { 1.0 - (new_tokens as f32 / active_sequence.len() as f32) } else { 0.0 }; { let mut hit_rates_guard = hit_rates_clone.lock().await; hit_rates_guard.push_back(hit_rate); @@ -644,7 +642,7 @@ mod tests { // Create scheduler args let args = MockEngineArgs::builder() - .num_gpu_blocks(1000) // Large enough to not be a constraint + .num_gpu_blocks(100) // Large enough to not be a constraint .block_size(block_size) .speedup_ratio(speedup_ratio) .build() From 8e8d0b4cec406e506147f94bbd490f884ff50493 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Fri, 13 Jun 2025 02:06:40 -0700 Subject: [PATCH 09/36] can emit kv event, not tested --- lib/llm/src/mocker/engine.rs | 1 + lib/llm/src/mocker/kv_manager.rs | 80 +++++++++++++++++++++++---- lib/llm/src/mocker/protocols.rs | 43 +++++++++++++-- lib/llm/src/mocker/scheduler.rs | 41 +++++++++++--- lib/llm/src/mocker/sequence.rs | 94 ++++++++++++++++++++++---------- 5 files changed, 208 insertions(+), 51 deletions(-) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index 9df68b6f52..914a6c1ad0 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -106,6 +106,7 @@ impl MockVllmEngine { args.clone(), Some(dp_rank), Some(output_tx), + None, Some(cancel_token.clone()), ); diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs index ca2d6fc50a..37721db04e 100644 --- a/lib/llm/src/mocker/kv_manager.rs +++ b/lib/llm/src/mocker/kv_manager.rs @@ -46,10 +46,11 @@ //! implementation of the main block manager. use crate::mocker::evictor::LRUEvictor; -use crate::mocker::protocols::{MoveBlock, PrefillCost, UniqueBlock}; +use crate::mocker::protocols::{MoveBlock, MoveBlockResponse, PrefillCost, UniqueBlock}; use crate::mocker::sequence::ActiveSequence; use derive_getters::Getters; use std::collections::{HashMap, HashSet}; +use tokio::sync::mpsc; #[derive(Getters)] pub struct KvManager { @@ -64,10 +65,20 @@ pub struct KvManager { inactive_blocks: LRUEvictor, all_blocks: HashSet, + + move_block_response_tx: Option>, } impl KvManager { pub fn new(max_capacity: usize, block_size: usize) -> Self { + Self::new_with_sender(max_capacity, block_size, None) + } + + pub fn new_with_sender( + max_capacity: usize, + block_size: usize, + move_block_response_tx: Option>, + ) -> Self { let active_blocks = HashMap::new(); let inactive_blocks = LRUEvictor::default(); let all_blocks = HashSet::new(); @@ -78,13 +89,39 @@ impl KvManager { active_blocks, inactive_blocks, all_blocks, + move_block_response_tx, + } + } + + /// Utility method to send block responses with optional reversing + fn send_block_response( + &self, + mut blocks: Vec, + reverse: bool, + store: bool, + parent_hash: Option, + ) { + if let Some(ref tx) = self.move_block_response_tx { + if !blocks.is_empty() { + if reverse { + blocks.reverse(); + } + let response = if store { + MoveBlockResponse::Store(blocks, parent_hash) + } else { + MoveBlockResponse::Remove(blocks) + }; + tx.send(response).unwrap(); + } } } /// Process a MoveBlock instruction synchronously pub fn process(&mut self, event: &MoveBlock) -> bool { match event { - MoveBlock::Use(hashes, _) => { + MoveBlock::Use(hashes) => { + let mut blocks_stored = Vec::::new(); + for hash in hashes { // First check if it already exists in active blocks if let Some(ref_count) = self.active_blocks.get_mut(hash) { @@ -106,30 +143,47 @@ impl KvManager { // If at max capacity, evict the oldest entry from inactive blocks if active_count + inactive_count >= self.max_capacity { - if let Some(evicted) = self.inactive_blocks.evict() { - // Remove evicted block from all_blocks - self.all_blocks.remove(&evicted); - } else { - // Cannot evict block, meaning no free blocks left in inactive pool - // Send a signal, scheduler would expect to handle preemption upon receiving this + let Some(evicted) = self.inactive_blocks.evict() else { return false; + }; + self.all_blocks.remove(&evicted); + if let UniqueBlock::FullBlock(evicted_full_block) = evicted { + self.send_block_response(vec![evicted_full_block], false, false, None); } } // Now insert the new block in active blocks with reference count 1 self.active_blocks.insert(hash.clone(), 1); - // Add to all_blocks as it's a new block self.all_blocks.insert(hash.clone()); + if self.move_block_response_tx.is_some() { + if let UniqueBlock::FullBlock(stored_full_block) = hash { + blocks_stored.push(*stored_full_block); + } + } } + self.send_block_response(blocks_stored, false, true, None); } + MoveBlock::Destroy(hashes) => { + let mut blocks_destroyed = Vec::::new(); + // Loop in inverse direction for hash in hashes.iter().rev() { self.active_blocks.remove(hash).unwrap(); // Remove from all_blocks when destroyed assert!(self.all_blocks.remove(hash)); + + // Track blocks for batch sending + if self.move_block_response_tx.is_some() { + if let UniqueBlock::FullBlock(destroyed_full_block) = hash { + blocks_destroyed.push(*destroyed_full_block); + } + } } + + self.send_block_response(blocks_destroyed, true, false, None); } + MoveBlock::Deref(hashes) => { // Loop in inverse direction for hash in hashes.iter().rev() { @@ -149,7 +203,8 @@ impl KvManager { } } } - MoveBlock::Promote(uuid, hash) => { + + MoveBlock::Promote(uuid, hash, parent_hash) => { let uuid_block = UniqueBlock::PartialBlock(*uuid); let hash_block = UniqueBlock::FullBlock(*hash); @@ -167,6 +222,7 @@ impl KvManager { // Update all_blocks assert!(self.all_blocks.remove(&uuid_block)); self.all_blocks.insert(hash_block); + self.send_block_response(vec![*hash], false, true, *parent_hash); } } @@ -252,7 +308,7 @@ mod tests { // Helper function to use multiple blocks that returns the response fn use_blocks(manager: &mut KvManager, ids: Vec) -> bool { let blocks = ids.into_iter().map(UniqueBlock::FullBlock).collect(); - manager.process(&MoveBlock::Use(blocks, None)) + manager.process(&MoveBlock::Use(blocks)) } // First use 10 blocks (0 to 9) in a batch @@ -279,7 +335,7 @@ mod tests { // Helper function to use multiple blocks fn use_blocks(manager: &mut KvManager, ids: Vec) { let blocks = ids.into_iter().map(UniqueBlock::FullBlock).collect(); - manager.process(&MoveBlock::Use(blocks, None)); + manager.process(&MoveBlock::Use(blocks)); } // Helper function to destroy multiple blocks diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs index 4f59e2d8dc..7fd8895594 100644 --- a/lib/llm/src/mocker/protocols.rs +++ b/lib/llm/src/mocker/protocols.rs @@ -17,9 +17,12 @@ use derive_builder::Builder; use serde::{Deserialize, Serialize}; use uuid::Uuid; +use crate::kv_router::protocols::{ + ExternalSequenceBlockHash, KvCacheEventData, KvCacheRemoveData, KvCacheStoreData, + KvCacheStoredBlockData, LocalBlockHash, +}; + pub type Token = u32; -pub type LocalBlockHash = u64; -/// A global hash identifier for blocks pub type GlobalHash = u64; pub type NumBlocks = usize; @@ -40,12 +43,19 @@ impl Default for UniqueBlock { } /// Represents different block movement operations in the cache +/// For Use and Promote variants, parent hash is the second field #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub enum MoveBlock { - Use(Vec, Option), + Use(Vec), Destroy(Vec), Deref(Vec), - Promote(Uuid, GlobalHash), + Promote(Uuid, GlobalHash, Option), +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum MoveBlockResponse { + Store(Vec, Option), + Remove(Vec), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -101,6 +111,31 @@ impl MockEngineArgs { } } +/// Note: This assumes block_hash and tokens_hash are the same, which is not correct in rare cases +/// where the sequence-aware hash differs from the token content hash. +pub fn block_response_to_kv_event(response: MoveBlockResponse) -> KvCacheEventData { + match response { + MoveBlockResponse::Store(full_blocks, parent_hash) => { + KvCacheEventData::Stored(KvCacheStoreData { + parent_hash: parent_hash.map(ExternalSequenceBlockHash), + blocks: full_blocks + .into_iter() + .map(|block| KvCacheStoredBlockData { + block_hash: ExternalSequenceBlockHash(block), + tokens_hash: LocalBlockHash(block), + }) + .collect(), + }) + } + MoveBlockResponse::Remove(full_blocks) => KvCacheEventData::Removed(KvCacheRemoveData { + block_hashes: full_blocks + .into_iter() + .map(ExternalSequenceBlockHash) + .collect(), + }), + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 368d21f55b..a4ca6b830e 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -40,11 +40,13 @@ //! ## NOTE //! The current prefill and decoding time simulations are not scientific at all and are WIP -use crate::kv_router::protocols::ForwardPassMetrics; +use crate::kv_router::protocols::{ForwardPassMetrics, KvCacheEventData}; use crate::mocker::evictor::LRUEvictor; use crate::mocker::kv_manager::KvManager; -use crate::mocker::protocols::{DirectRequest, MockEngineArgs}; -use crate::mocker::protocols::{MoveBlock, OutputSignal, PrefillCost, UniqueBlock}; +use crate::mocker::protocols::{ + block_response_to_kv_event, MoveBlock, OutputSignal, PrefillCost, UniqueBlock, +}; +use crate::mocker::protocols::{DirectRequest, MockEngineArgs, MoveBlockResponse}; use crate::mocker::sequence::ActiveSequence; use std::collections::HashMap; use std::collections::VecDeque; @@ -204,12 +206,23 @@ impl Scheduler { args: MockEngineArgs, dp_rank: Option, output_tx: Option>, + kv_events_tx: Option>, cancellation_token: Option, ) -> Self { let state = Arc::new(Mutex::new(SchedulerState::default())); - let kv_manager = Arc::new(Mutex::new(KvManager::new( + + // Create internal channel for KV events only if needed + let (block_resp_tx, mut block_resp_rx) = if kv_events_tx.is_some() { + let (tx, rx) = mpsc::unbounded_channel::(); + (Some(tx), Some(rx)) + } else { + (None, None) + }; + + let kv_manager = Arc::new(Mutex::new(KvManager::new_with_sender( args.num_gpu_blocks, args.block_size, + block_resp_tx, ))); let hit_rates = Arc::new(Mutex::new(VecDeque::with_capacity(1000))); @@ -320,6 +333,13 @@ impl Scheduler { if !prefill_success { panic!("Block allocation for prefilling cannot fail."); } + + // Drain KV events and forward to relay after prefill signal processing + if let (Some(ref relay_tx), Some(ref mut rx)) = (&kv_events_tx, &mut block_resp_rx) { + while let Ok(event) = rx.try_recv() { + let _ = relay_tx.try_send(block_response_to_kv_event(event)); + } + } } // Process decoding @@ -341,6 +361,13 @@ impl Scheduler { continue; } + // Drain KV events and forward to relay after decode signal processing + if let (Some(ref relay_tx), Some(ref mut rx)) = (&kv_events_tx, &mut block_resp_rx) { + while let Ok(event) = rx.try_recv() { + let _ = relay_tx.try_send(block_response_to_kv_event(event)); + } + } + // Send UUID notification for each generated token if let Some(tx) = &output_tx_clone { let signal = OutputSignal { @@ -485,7 +512,7 @@ fn process_signals( } // Check we have a Use signal with blocks - let MoveBlock::Use(blocks, _) = signal else { + let MoveBlock::Use(blocks) = signal else { panic!("Failed signal is Invalid. Has to fail on generation signal."); }; @@ -536,7 +563,7 @@ mod tests { .unwrap(); // Create scheduler with new args struct - let scheduler = Scheduler::new(args, None, Some(output_tx), None); + let scheduler = Scheduler::new(args, None, Some(output_tx), None, None); // Create shared tokens for caching case let shared_tokens = if use_shared_tokens { @@ -649,7 +676,7 @@ mod tests { .unwrap(); // Create scheduler - let scheduler = Scheduler::new(args, None, Some(output_tx), None); + let scheduler = Scheduler::new(args, None, Some(output_tx), None, None); // Create identical tokens for all requests let identical_tokens: Vec = (0..token_length).map(|i| i as u32).collect(); diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs index b79e79872a..17ef65d2c3 100644 --- a/lib/llm/src/mocker/sequence.rs +++ b/lib/llm/src/mocker/sequence.rs @@ -73,7 +73,7 @@ impl ActiveSequence { let tokens = Tokens::from(tokens).into_sequence(block_size, None); let unique_blocks = create_unique_blocks_from_sequence(&tokens, None, block_size); - let creation_signal = Some(MoveBlock::Use(unique_blocks.clone(), None)); + let creation_signal = Some(MoveBlock::Use(unique_blocks.clone())); Self { unique_blocks, @@ -109,6 +109,17 @@ impl ActiveSequence { (sequence, signal) } + /// Get the parent hash from the second-to-last block if it exists and is a FullBlock + fn get_parent_hash(&self) -> Option { + if self.unique_blocks.len() < 2 { + return None; + } + match &self.unique_blocks[self.unique_blocks.len() - 2] { + UniqueBlock::FullBlock(hash) => Some(*hash), + _ => panic!("Cannot have a partial block as parent"), + } + } + /// Push a token to the sequence pub fn push(&mut self, token: u32) -> Option> { self.tokens.append(token).expect("Token push failed."); @@ -128,12 +139,16 @@ impl ActiveSequence { self.unique_blocks.pop(); self.unique_blocks .push(UniqueBlock::FullBlock(last_block_hash)); - signals.push(MoveBlock::Promote(uuid, last_block_hash)); + signals.push(MoveBlock::Promote( + uuid, + last_block_hash, + self.get_parent_hash(), + )); } let new_partial_block = UniqueBlock::default(); self.unique_blocks.push(new_partial_block.clone()); - signals.push(MoveBlock::Use(vec![new_partial_block], None)); + signals.push(MoveBlock::Use(vec![new_partial_block])); Some(signals) } @@ -201,7 +216,7 @@ impl ActiveSequence { self.unique_blocks = create_unique_blocks_from_sequence(&self.tokens, None, self.block_size); self.generated_tokens = 0; - self.creation_signal = Some(MoveBlock::Use(self.unique_blocks.clone(), None)); + self.creation_signal = Some(MoveBlock::Use(self.unique_blocks.clone())); free_signal } @@ -233,7 +248,7 @@ mod tests { // Check that we got a Use signal assert!(signal1.is_some()); match &signal1 { - Some(MoveBlock::Use(blocks, _)) => { + Some(MoveBlock::Use(blocks)) => { assert_eq!(blocks.len(), 1); } _ => panic!("Expected Use signal"), @@ -252,24 +267,23 @@ mod tests { let signal_16 = signal_16.unwrap(); assert_eq!(signal_16.len(), 2); + // First signal should be Promote for the previous block + match &signal_16[0] { + MoveBlock::Promote(_, _, parent_hash) => { + assert_eq!(*parent_hash, None); + } + _ => panic!("Expected Promote signal as second signal"), + } + // Second signal should be Use for new partial block match &signal_16[1] { - MoveBlock::Use(blocks, _) => { + MoveBlock::Use(blocks) => { assert_eq!(blocks.len(), 1); assert!(matches!(blocks[0], UniqueBlock::PartialBlock(_))); } _ => panic!("Expected Use signal as first signal"), } - // First signal should be Promote for the previous block - match &signal_16[0] { - MoveBlock::Promote(uuid, _) => { - // The uuid is generated dynamically, so we just check it exists - let _ = uuid; - } - _ => panic!("Expected Promote signal as second signal"), - } - // Verify state after pushing tokens assert_eq!(seq1.unique_blocks().len(), 2); // One full block and one partial block assert_eq!(seq1.len(), 17); @@ -339,6 +353,32 @@ mod tests { "First two blocks should be identical" ); + // Push tokens 34..47 to seq1 + for token in 33..48 { + seq1.push(token); + } + + // Push token 47 and get the signal - this completes the block and triggers signals + let signal = seq1.push(48); + let signal = signal.unwrap(); + + // Check that signal[0] is promote + match &signal[0] { + MoveBlock::Promote(_, _, parent_hash) => { + // Check that the parent_hash matches unique_blocks[1], which should be a full block + if let UniqueBlock::FullBlock(expected_hash) = seq1.unique_blocks()[1] { + assert_eq!( + *parent_hash, + Some(expected_hash), + "Parent hash should match unique_blocks[1]" + ); + } else { + panic!("unique_blocks[1] should be a full block"); + } + } + _ => panic!("Expected Promote signal as first signal"), + } + // Reset seq1 and check that it equals the original clone let free_signals = seq1.reset_with_signal(); @@ -355,7 +395,7 @@ mod tests { // Initial signal - should have received a Use signal for the partial block assert!(signal.is_some()); match signal { - Some(MoveBlock::Use(blocks, _)) => { + Some(MoveBlock::Use(blocks)) => { assert_eq!(blocks.len(), 1); assert!(matches!(blocks[0], UniqueBlock::PartialBlock(_))); } @@ -371,25 +411,23 @@ mod tests { let signals_second = seq.generate(); assert_eq!(signals_second.len(), 2); - // First signal should be Use for new partial block + // First signal should be Promote + match &signals_second[0] { + MoveBlock::Promote(_, _, parent_hash) => { + assert_eq!(*parent_hash, None); + } + _ => panic!("Expected Promote signal as first signal after second token"), + } + + // Second signal should be Use for new partial block match &signals_second[1] { - MoveBlock::Use(blocks, _) => { + MoveBlock::Use(blocks) => { assert_eq!(blocks.len(), 1); assert!(matches!(blocks[0], UniqueBlock::PartialBlock(_))); } _ => panic!("Expected Use signal as second signal after second token"), } - // Second signal should be Promote - match &signals_second[0] { - MoveBlock::Promote(uuid, hash) => { - // The uuid and hash values are generated dynamically, so we just check the event type - let _ = uuid; - let _ = hash; - } - _ => panic!("Expected Promote signal as first signal after second token"), - } - // Generate fourth token - should not trigger new signals as it's adding to partial block let signals_third = seq.generate(); assert_eq!(signals_third.len(), 0); From e96f8103c037d7797899cb38773236b0e1432133 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Fri, 13 Jun 2025 02:59:31 -0700 Subject: [PATCH 10/36] move block resp test in kv manager --- lib/llm/src/mocker/kv_manager.rs | 91 ++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 4 deletions(-) diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs index 37721db04e..9d061b8ed0 100644 --- a/lib/llm/src/mocker/kv_manager.rs +++ b/lib/llm/src/mocker/kv_manager.rs @@ -122,11 +122,13 @@ impl KvManager { MoveBlock::Use(hashes) => { let mut blocks_stored = Vec::::new(); + let mut parent_block: Option<&UniqueBlock> = None; for hash in hashes { // First check if it already exists in active blocks if let Some(ref_count) = self.active_blocks.get_mut(hash) { // Block already active, just increment reference count *ref_count += 1; + parent_block = Some(hash); continue; } @@ -134,6 +136,7 @@ impl KvManager { if self.inactive_blocks.remove(hash) { // Insert into active with reference count 1 self.active_blocks.insert(hash.clone(), 1); + parent_block = Some(hash); continue; } @@ -161,7 +164,13 @@ impl KvManager { } } } - self.send_block_response(blocks_stored, false, true, None); + + let parent_hash = match parent_block { + None => None, + Some(UniqueBlock::FullBlock(block)) => Some(*block), + Some(UniqueBlock::PartialBlock(_)) => panic!("parent block cannot be partial"), + }; + self.send_block_response(blocks_stored, false, true, parent_hash); } MoveBlock::Destroy(hashes) => { @@ -299,6 +308,7 @@ impl KvManager { #[cfg(test)] mod tests { use super::*; + use tokio::sync::mpsc; #[test] fn test_failure_on_max_capacity() { @@ -327,10 +337,12 @@ mod tests { } #[test] - // This is taken directly from the example in the vllm v1 prefix caching docs fn test_block_lifecycle_stringent() { - // Create a KvManager with 10 blocks capacity - let mut manager = KvManager::new(10, 16); + // Create a channel to listen to block responses + let (tx, mut rx) = mpsc::unbounded_channel::(); + + // Create a KvManager with 10 blocks capacity and the response sender + let mut manager = KvManager::new_with_sender(10, 16, Some(tx)); // Helper function to use multiple blocks fn use_blocks(manager: &mut KvManager, ids: Vec) { @@ -350,6 +362,65 @@ mod tests { manager.process(&MoveBlock::Deref(blocks)); } + // Helper function to assert block responses + fn assert_block_response( + rx: &mut mpsc::UnboundedReceiver, + expected_type: &str, + expected_blocks: Vec, + description: &str, + ) { + let response = rx + .try_recv() + .unwrap_or_else(|_| panic!("Expected {} response {}", expected_type, description)); + + match (&response, expected_type) { + (MoveBlockResponse::Store(blocks, _parent_hash), "Store") => { + assert_eq!( + blocks.len(), + expected_blocks.len(), + "Expected {} blocks in Store response {}", + expected_blocks.len(), + description + ); + assert_eq!( + *blocks, expected_blocks, + "Store blocks don't match expected {}", + description + ); + } + (MoveBlockResponse::Remove(blocks), "Remove") => { + assert_eq!( + blocks.len(), + expected_blocks.len(), + "Expected {} blocks in Remove response {}", + expected_blocks.len(), + description + ); + assert_eq!( + *blocks, expected_blocks, + "Remove blocks don't match expected {}", + description + ); + } + _ => panic!( + "Expected {} response, got {:?} {}", + expected_type, response, description + ), + } + } + + // Helper function to assert no response is received + fn assert_no_response( + rx: &mut mpsc::UnboundedReceiver, + description: &str, + ) { + assert!( + rx.try_recv().is_err(), + "Expected no response {}", + description + ); + } + // Helper function to check if active blocks contain expected blocks with expected ref counts fn assert_active_blocks(manager: &KvManager, expected_blocks: &[(u64, usize)]) { assert_eq!( @@ -400,9 +471,11 @@ mod tests { // First use blocks 0, 1, 2, 3, 4 in a batch use_blocks(&mut manager, (0..5).collect()); + assert_block_response(&mut rx, "Store", vec![0, 1, 2, 3, 4], "after first use"); // Then use blocks 0, 1, 5, 6 in a batch use_blocks(&mut manager, vec![0, 1, 5, 6]); + assert_block_response(&mut rx, "Store", vec![5, 6], "after second use"); // Check that the blocks 0 and 1 are in active blocks, both with reference counts of 2 assert_active_blocks( @@ -412,9 +485,11 @@ mod tests { // Now destroy block 4 destroy_blocks(&mut manager, vec![4]); + assert_block_response(&mut rx, "Remove", vec![4], "after destroy block 4"); // And deref blocks 3, 2, 1, 0 in this order as a batch deref_blocks(&mut manager, vec![0, 1, 2, 3]); + assert_no_response(&mut rx, "after deref operation"); // Check that the inactive_blocks is size 2 (via num_objects) and contains 3 and 2 assert_inactive_blocks(&manager, 2, &[3, 2]); @@ -422,6 +497,7 @@ mod tests { // Now destroy block 6 destroy_blocks(&mut manager, vec![6]); + assert_block_response(&mut rx, "Remove", vec![6], "after block 6 eviction"); // And deref blocks 5, 1, 0 as a batch deref_blocks(&mut manager, vec![0, 1, 5]); @@ -432,6 +508,7 @@ mod tests { // Now use 0, 1, 2, 7, 8, 9 as a batch use_blocks(&mut manager, vec![0, 1, 2, 7, 8, 9]); + assert_block_response(&mut rx, "Store", vec![7, 8, 9], "after [7, 8, 9] use"); // Check that the inactive_blocks is size 2, and contains 3 and 5 assert_inactive_blocks(&manager, 2, &[3, 5]); @@ -446,8 +523,14 @@ mod tests { // Now use blocks 10, 11, 12 as a batch use_blocks(&mut manager, vec![10, 11, 12]); + assert_block_response(&mut rx, "Remove", vec![3], "after block 5 eviction"); + assert_block_response(&mut rx, "Store", vec![10, 11, 12], "after [10, 11, 12] use"); // Check that the inactive_blocks is size 1 and contains only 5 assert_inactive_blocks(&manager, 1, &[5]); + + use_blocks(&mut manager, vec![13]); + assert_block_response(&mut rx, "Remove", vec![5], "after block 5 eviction"); + assert_block_response(&mut rx, "Store", vec![13], "after block 13 use"); } } From c09f007067ded64c62bf7d68b37693b35832e38b Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 14 Jun 2025 02:50:20 -0700 Subject: [PATCH 11/36] basic test passes for both load metrics and kv events --- lib/llm/src/mocker/engine.rs | 368 +++++++++++++++++++++++++++----- lib/llm/src/mocker/scheduler.rs | 22 +- 2 files changed, 324 insertions(+), 66 deletions(-) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index 914a6c1ad0..ca6eb968de 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -28,9 +28,12 @@ use dynamo_runtime::{ engine::AsyncEngineContextProvider, pipeline::{async_trait, AsyncEngine, Error, ManyOut, ResponseStream, SingleIn}, protocols::annotated::Annotated, + traits::DistributedRuntimeProvider, Result, }; +use crate::kv_router::protocols::{KvCacheEvent, KvCacheEventData}; +use crate::kv_router::publisher::KvEventPublisher; use rand::Rng; use std::collections::HashMap; use std::sync::Arc; @@ -72,12 +75,21 @@ impl MockVllmEngine { let cancel_token = cancel_token.unwrap_or_default(); - // Create schedulers and start their background tasks - let schedulers = + // Create schedulers and get their KV event receivers + let (schedulers, kv_event_receivers) = Self::start_schedulers(args.clone(), active_requests.clone(), cancel_token.clone()); - // Start metrics publishing tasks - Self::start_metrics_publishing(&schedulers, component, cancel_token.clone()).await?; + Self::start_metrics_publishing(&schedulers, component.clone(), cancel_token.clone()) + .await?; + + // Start KV events publishing with the actual receivers from schedulers + Self::start_kv_events_publishing( + kv_event_receivers, + component.clone(), + args.block_size, + cancel_token.clone(), + ) + .await?; let engine = Self { schedulers, @@ -90,27 +102,36 @@ impl MockVllmEngine { } /// Create schedulers and spawn their background tasks for distributing token notifications + /// Returns schedulers and their corresponding KV event receivers fn start_schedulers( args: MockEngineArgs, active_requests: Arc>>>, cancel_token: CancellationToken, - ) -> Vec { + ) -> ( + Vec, + Vec>, + ) { let mut schedulers = Vec::new(); + let mut kv_event_receivers = Vec::new(); // Create multiple schedulers and their background tasks for dp_rank in 0..args.dp_size { // Create a shared output channel that this scheduler will use - let (output_tx, output_rx) = mpsc::channel::(1024); + let (output_tx, output_rx) = mpsc::unbounded_channel::(); + + // Create a channel for KV events from this scheduler + let (kv_events_tx, kv_events_rx) = mpsc::unbounded_channel::(); let scheduler = Scheduler::new( args.clone(), Some(dp_rank), Some(output_tx), - None, + Some(kv_events_tx), // Pass the KV events sender to scheduler Some(cancel_token.clone()), ); schedulers.push(scheduler); + kv_event_receivers.push(kv_events_rx); // Spawn a background task for this scheduler to distribute token notifications to active requests let output_rx = Arc::new(Mutex::new(output_rx)); @@ -142,7 +163,7 @@ impl MockVllmEngine { }); } - schedulers + (schedulers, kv_event_receivers) } /// Start background tasks to poll and publish metrics every second @@ -151,12 +172,27 @@ impl MockVllmEngine { component: Option, cancel_token: CancellationToken, ) -> Result<()> { + println!("🔧 Creating metrics publisher..."); let metrics_publisher = Arc::new(WorkerMetricsPublisher::new()?); + println!("✓ Metrics publisher created"); if let Some(comp) = component { - metrics_publisher.create_endpoint(comp).await?; + println!("🔧 Creating metrics endpoint..."); + tokio::spawn({ + let publisher = metrics_publisher.clone(); + async move { + if let Err(e) = publisher.create_endpoint(comp.clone()).await { + println!("Metrics endpoint failed: {}", e); + } + } + }); + + // Give it a moment to start + tokio::time::sleep(Duration::from_millis(100)).await; + println!("✓ Metrics endpoint started (background)"); } + println!("🔧 Starting metrics background tasks..."); for (dp_rank, scheduler) in schedulers.iter().enumerate() { let scheduler = scheduler.clone(); let publisher = metrics_publisher.clone(); @@ -164,7 +200,7 @@ impl MockVllmEngine { let cancel_token = cancel_token.clone(); tokio::spawn(async move { - let mut interval = interval(Duration::from_secs(1)); + let mut interval = interval(Duration::from_millis(100)); loop { tokio::select! { @@ -187,6 +223,82 @@ impl MockVllmEngine { } }); } + println!("✓ Metrics background tasks started"); + Ok(()) + } + + /// Start background tasks to collect and publish KV events from schedulers + async fn start_kv_events_publishing( + kv_event_receivers: Vec>, + component: Option, + block_size: usize, + cancel_token: CancellationToken, + ) -> Result<()> { + println!("🔧 Starting KV events publishing..."); + + // Only start KV events publishing if we have a component + let Some(comp) = component else { + println!("⚠️ No component provided, skipping KV events publishing"); + return Ok(()); + }; + println!("✓ Component found for KV events publishing"); + + println!("🔧 Getting worker_id..."); + let worker_id = comp + .drt() + .primary_lease() + .expect("Cannot publish KV events without lease") // ← This will PANIC on static! + .id(); + // let worker_id = 0; + println!("✓ Worker_id set to: {}", worker_id); + + println!("🔧 Creating KV event publisher..."); + let kv_event_publisher = Arc::new(KvEventPublisher::new( + comp.clone(), + worker_id, + block_size, + None, + )?); + println!("✓ KV event publisher created"); + + println!( + "🔧 Starting KV event background tasks for {} receivers...", + kv_event_receivers.len() + ); + for (dp_rank, mut kv_events_rx) in kv_event_receivers.into_iter().enumerate() { + println!("🔧 Starting background task for DP rank {}", dp_rank); + let publisher = kv_event_publisher.clone(); + let dp_rank = dp_rank as u32; + let cancel_token = cancel_token.clone(); + + tokio::spawn(async move { + println!("✓ Background task started for DP rank {}", dp_rank); + loop { + tokio::select! { + // Receive actual KV events from the scheduler + Some(event_data) = kv_events_rx.recv() => { + // Convert KvCacheEventData to KvCacheEvent with random UUID as event_id + let event = KvCacheEvent { + event_id: Uuid::new_v4().as_u128() as u64, + data: event_data, + }; + + // Publish the event + if let Err(e) = publisher.publish(event) { + tracing::warn!("Failed to publish KV event for DP rank {}: {}", dp_rank, e); + } else { + tracing::trace!("Published KV event for DP rank {}", dp_rank); + } + } + _ = cancel_token.cancelled() => { + tracing::info!("KV events publishing cancelled for DP rank {}", dp_rank); + break; + } + } + } + }); + } + println!("✓ All KV event background tasks started"); Ok(()) } @@ -267,77 +379,223 @@ impl AsyncEngine, ManyOut>, Error> for } #[cfg(test)] -mod tests { +mod integration_tests { use super::*; - use dynamo_runtime::pipeline::Context; + use crate::kv_router::indexer::RouterEvent; + use crate::kv_router::KV_EVENT_SUBJECT; + use dynamo_runtime::{ + pipeline::Context, + pipeline::{network::Ingress, PushRouter}, + traits::events::EventSubscriber, + DistributedRuntime, Worker, + }; use futures::StreamExt; + use tokio::time::timeout; #[tokio::test] - async fn test_multiple_workers_with_token_limit() { + #[ignore] // Run with: cargo test -- --ignored + async fn test_mock_vllm_engine_full_integration() -> Result<()> { const DP_SIZE: u32 = 2; const TOKENS_PER_REQUEST: usize = 20; - - // Create the MockVllmEngine using builder pattern + const BLOCK_SIZE: usize = 2; + + // Create runtime and distributed runtime + let worker = Worker::from_settings()?; + let runtime = worker.runtime(); + let distributed = DistributedRuntime::from_settings(runtime.clone()).await?; + println!("✓ Runtime and distributed runtime created"); + + // Create component for MockVllmEngine (needed for publishers) + let test_component = distributed + .namespace("test")? + .component("mock-vllm")? + .service_builder() + .create() + .await?; + println!("✓ Test component created"); + + // Create MockVllmEngine WITH component (enables publishers) let args = MockEngineArgs::builder() .speedup_ratio(10.0) .dp_size(DP_SIZE) + .block_size(BLOCK_SIZE) .build() .unwrap(); - let engine = MockVllmEngine::new(args, None, None).await.unwrap(); + let engine = Arc::new(MockVllmEngine::new(args, Some(test_component.clone()), None).await?); + println!("✓ MockVllmEngine created with DP_SIZE: {}", DP_SIZE); + + // Set up KV events subscriber + let mut kv_events_subscriber = test_component.subscribe(KV_EVENT_SUBJECT).await?; + println!("✓ KV events subscriber created"); + + // Wrap with Ingress and register with component/endpoint + let ingress = Ingress::for_engine(engine)?; + println!("✓ Ingress wrapper created"); + + // Start the server in background + let server_handle = tokio::spawn({ + let test_component = test_component.clone(); + async move { + if let Err(e) = test_component + .endpoint("generate") + .endpoint_builder() + .handler(ingress) + .start() + .await + { + eprintln!("❌ Generate endpoint failed: {}", e); + } + } + }); + println!("✓ Server started in background"); + + // Give server time to start + tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; + println!("✓ Server startup delay completed"); + + // Print all registered instances from etcd + match test_component.list_instances().await { + Ok(instances) => { + println!("📋 Found {} registered instances:", instances.len()); + for instance in instances { + println!( + " • {}/{}/{} (ID: {})", + instance.namespace, + instance.component, + instance.endpoint, + instance.instance_id + ); + } + } + Err(e) => { + println!("❌ Failed to list instances: {}", e); + } + } + + // Create client + let client = distributed + .namespace("test")? + .component("mock-vllm")? + .endpoint("generate") + .client() + .await?; + println!("✓ Client created"); + + let router = PushRouter::from_client(client, Default::default()).await?; + println!("✓ Router created"); + + // Create test requests for both DP workers + let create_request = |tokens: Vec, dp_rank: u32| DirectRequest { + tokens, + max_output_tokens: TOKENS_PER_REQUEST, + uuid: None, + dp_rank: Some(dp_rank), + }; - // Create 4 DirectRequests: 2 for worker 0, 2 for worker 1 let requests = vec![ - DirectRequest { - tokens: vec![1, 2, 3, 4], - max_output_tokens: TOKENS_PER_REQUEST, - uuid: None, - dp_rank: Some(0), - }, - DirectRequest { - tokens: vec![5, 6, 7, 8], - max_output_tokens: TOKENS_PER_REQUEST, - uuid: None, - dp_rank: Some(0), - }, - DirectRequest { - tokens: vec![9, 10, 11, 12], - max_output_tokens: TOKENS_PER_REQUEST, - uuid: None, - dp_rank: Some(1), - }, - DirectRequest { - tokens: vec![13, 14, 15, 16], - max_output_tokens: TOKENS_PER_REQUEST, - uuid: None, - dp_rank: Some(1), - }, + create_request(vec![1, 2, 3, 4, 5], 0), + create_request(vec![1, 2, 3, 4, 5], 0), + create_request(vec![1, 2, 3, 4, 5], 1), + create_request(vec![1, 2, 3, 4, 5], 1), ]; + println!( + "✓ Test requests created ({} requests total)", + requests.len() + ); - // Generate streams and collect all tokens from each - for request in requests { - let ctx = Context::new(request); - let stream = engine.generate(ctx).await.unwrap(); + // Test each request + for (i, request) in requests.into_iter().enumerate() { + println!("Testing request {}", i + 1); - let tokens: Vec<_> = stream.collect().await; + let response_stream = router.generate(Context::new(request)).await?; + let responses: Vec> = response_stream.collect().await; // Verify each stream produces exactly the expected number of tokens - assert_eq!(tokens.len(), TOKENS_PER_REQUEST); + assert_eq!( + responses.len(), + TOKENS_PER_REQUEST, + "Request {} should produce {} tokens, got {}", + i + 1, + TOKENS_PER_REQUEST, + responses.len() + ); // Verify all tokens contain valid data - for token in tokens { - assert!(token.data.is_some()); + for (j, token) in responses.iter().enumerate() { + if let Some(char_data) = &token.data { + assert!( + !char_data.is_empty(), + "Request {} token {} should not be empty", + i + 1, + j + 1 + ); + } else { + panic!("Request {} token {} should have data", i + 1, j + 1); + } + } + + println!( + "✓ Request {} completed successfully with {} tokens", + i + 1, + responses.len() + ); + } + + println!("🎉 All requests completed successfully!"); + + // Try to receive at least one KV event with 100ms timeout + println!("Waiting for KV event with 100ms timeout..."); + let msg = timeout(Duration::from_millis(100), kv_events_subscriber.next()) + .await + .map_err(|_| Error::msg("Timeout waiting for KV event"))? + .ok_or_else(|| Error::msg("KV events stream ended unexpectedly"))?; + + match serde_json::from_slice::(&msg.payload) { + Ok(event) => { + println!("✓ Received KV event: {:?}", event); + } + Err(e) => { + return Err(Error::msg(format!("Failed to deserialize KV event: {}", e))); } } - // Give a small delay to ensure cleanup tasks complete - tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + // Use KvMetricsAggregator to get metrics more easily + let cancel_token = test_component.drt().runtime().child_token(); + let metrics_aggregator = crate::kv_router::metrics_aggregator::KvMetricsAggregator::new( + test_component.clone(), + cancel_token, + ) + .await; + tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; + + let processed_endpoints = metrics_aggregator.get_endpoints(); + println!( + "Found {} metrics endpoints", + processed_endpoints.endpoints.len() + ); - // Verify that active_requests is empty (all requests cleaned up) - let active_requests = engine.active_requests.lock().await; + // Verify we found at least one metrics endpoint assert!( - active_requests.is_empty(), - "Active requests should be empty after streams complete" + !processed_endpoints.endpoints.is_empty(), + "Should find at least one metrics endpoint" + ); + println!( + "✓ Successfully found {} metrics endpoints", + processed_endpoints.endpoints.len() ); + + // Verify the metrics endpoints contain valid data + for (worker_id, endpoint) in &processed_endpoints.endpoints { + println!("✓ Worker {} metrics: {:?}", worker_id, endpoint.data); + } + + println!("🎉 Event verification completed!"); + + // Cleanup + distributed.shutdown(); + server_handle.await?; + + Ok(()) } } diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index a4ca6b830e..255e2380fb 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -196,7 +196,7 @@ pub struct Scheduler { dp_rank: Option, state: Arc>, kv_manager: Arc>, - request_tx: mpsc::Sender, + request_tx: mpsc::UnboundedSender, hit_rates: Arc>>, } @@ -205,8 +205,8 @@ impl Scheduler { pub fn new( args: MockEngineArgs, dp_rank: Option, - output_tx: Option>, - kv_events_tx: Option>, + output_tx: Option>, + kv_events_tx: Option>, cancellation_token: Option, ) -> Self { let state = Arc::new(Mutex::new(SchedulerState::default())); @@ -234,7 +234,7 @@ impl Scheduler { ); // Create channel for request handling - let (request_tx, mut request_rx) = mpsc::channel::(1024); + let (request_tx, mut request_rx) = mpsc::unbounded_channel::(); // Create a clone for the background task let state_clone = state.clone(); @@ -337,7 +337,7 @@ impl Scheduler { // Drain KV events and forward to relay after prefill signal processing if let (Some(ref relay_tx), Some(ref mut rx)) = (&kv_events_tx, &mut block_resp_rx) { while let Ok(event) = rx.try_recv() { - let _ = relay_tx.try_send(block_response_to_kv_event(event)); + let _ = relay_tx.send(block_response_to_kv_event(event)); } } } @@ -364,7 +364,7 @@ impl Scheduler { // Drain KV events and forward to relay after decode signal processing if let (Some(ref relay_tx), Some(ref mut rx)) = (&kv_events_tx, &mut block_resp_rx) { while let Ok(event) = rx.try_recv() { - let _ = relay_tx.try_send(block_response_to_kv_event(event)); + let _ = relay_tx.send(block_response_to_kv_event(event)); } } @@ -374,7 +374,7 @@ impl Scheduler { uuid, completed: false, }; - let _ = tx.try_send(signal); + let _ = tx.send(signal); } // Check if we're done after generating @@ -384,7 +384,7 @@ impl Scheduler { uuid, completed: true, }; - let _ = tx.try_send(signal); + let _ = tx.send(signal); } state_guard.complete(&uuid); continue; @@ -414,7 +414,7 @@ impl Scheduler { /// Add a new request to the waiting queue pub async fn receive(&self, request: DirectRequest) { - let _ = self.request_tx.send(request).await; + let _ = self.request_tx.send(request); } /// Get the count of waiting requests @@ -552,7 +552,7 @@ mod tests { let max_output_tokens: usize = 100; // Create channel for token output - let (output_tx, mut output_rx) = mpsc::channel::(1024); + let (output_tx, mut output_rx) = mpsc::unbounded_channel::(); // Create scheduler args using builder let args = MockEngineArgs::builder() @@ -665,7 +665,7 @@ mod tests { let token_length = 65; // Create channel for token output - let (output_tx, mut output_rx) = mpsc::channel::(1024); + let (output_tx, mut output_rx) = mpsc::unbounded_channel::(); // Create scheduler args let args = MockEngineArgs::builder() From 4502e5e6a364b067af957965513420c7f2514f6f Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sat, 14 Jun 2025 02:56:41 -0700 Subject: [PATCH 12/36] better tracing --- lib/llm/src/mocker/engine.rs | 82 ++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index ca6eb968de..34f137d661 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -172,27 +172,27 @@ impl MockVllmEngine { component: Option, cancel_token: CancellationToken, ) -> Result<()> { - println!("🔧 Creating metrics publisher..."); + tracing::info!("Creating metrics publisher"); let metrics_publisher = Arc::new(WorkerMetricsPublisher::new()?); - println!("✓ Metrics publisher created"); + tracing::info!("Metrics publisher created"); if let Some(comp) = component { - println!("🔧 Creating metrics endpoint..."); + tracing::info!("Creating metrics endpoint"); tokio::spawn({ let publisher = metrics_publisher.clone(); async move { if let Err(e) = publisher.create_endpoint(comp.clone()).await { - println!("Metrics endpoint failed: {}", e); + tracing::error!("Metrics endpoint failed: {}", e); } } }); // Give it a moment to start tokio::time::sleep(Duration::from_millis(100)).await; - println!("✓ Metrics endpoint started (background)"); + tracing::info!("Metrics endpoint started (background)"); } - println!("🔧 Starting metrics background tasks..."); + tracing::info!("Starting metrics background tasks"); for (dp_rank, scheduler) in schedulers.iter().enumerate() { let scheduler = scheduler.clone(); let publisher = metrics_publisher.clone(); @@ -223,7 +223,7 @@ impl MockVllmEngine { } }); } - println!("✓ Metrics background tasks started"); + tracing::info!("Metrics background tasks started"); Ok(()) } @@ -234,45 +234,45 @@ impl MockVllmEngine { block_size: usize, cancel_token: CancellationToken, ) -> Result<()> { - println!("🔧 Starting KV events publishing..."); + tracing::info!("Starting KV events publishing"); // Only start KV events publishing if we have a component let Some(comp) = component else { - println!("⚠️ No component provided, skipping KV events publishing"); + tracing::warn!("No component provided, skipping KV events publishing"); return Ok(()); }; - println!("✓ Component found for KV events publishing"); + tracing::info!("Component found for KV events publishing"); - println!("🔧 Getting worker_id..."); + tracing::debug!("Getting worker_id"); let worker_id = comp .drt() .primary_lease() .expect("Cannot publish KV events without lease") // ← This will PANIC on static! .id(); // let worker_id = 0; - println!("✓ Worker_id set to: {}", worker_id); + tracing::debug!("Worker_id set to: {}", worker_id); - println!("🔧 Creating KV event publisher..."); + tracing::info!("Creating KV event publisher"); let kv_event_publisher = Arc::new(KvEventPublisher::new( comp.clone(), worker_id, block_size, None, )?); - println!("✓ KV event publisher created"); + tracing::info!("KV event publisher created"); - println!( - "🔧 Starting KV event background tasks for {} receivers...", + tracing::info!( + "Starting KV event background tasks for {} receivers", kv_event_receivers.len() ); for (dp_rank, mut kv_events_rx) in kv_event_receivers.into_iter().enumerate() { - println!("🔧 Starting background task for DP rank {}", dp_rank); + tracing::debug!("Starting background task for DP rank {}", dp_rank); let publisher = kv_event_publisher.clone(); let dp_rank = dp_rank as u32; let cancel_token = cancel_token.clone(); tokio::spawn(async move { - println!("✓ Background task started for DP rank {}", dp_rank); + tracing::debug!("Background task started for DP rank {}", dp_rank); loop { tokio::select! { // Receive actual KV events from the scheduler @@ -298,7 +298,7 @@ impl MockVllmEngine { } }); } - println!("✓ All KV event background tasks started"); + tracing::info!("All KV event background tasks started"); Ok(()) } @@ -403,7 +403,7 @@ mod integration_tests { let worker = Worker::from_settings()?; let runtime = worker.runtime(); let distributed = DistributedRuntime::from_settings(runtime.clone()).await?; - println!("✓ Runtime and distributed runtime created"); + tracing::info!("✓ Runtime and distributed runtime created"); // Create component for MockVllmEngine (needed for publishers) let test_component = distributed @@ -412,7 +412,7 @@ mod integration_tests { .service_builder() .create() .await?; - println!("✓ Test component created"); + tracing::info!("✓ Test component created"); // Create MockVllmEngine WITH component (enables publishers) let args = MockEngineArgs::builder() @@ -423,15 +423,15 @@ mod integration_tests { .unwrap(); let engine = Arc::new(MockVllmEngine::new(args, Some(test_component.clone()), None).await?); - println!("✓ MockVllmEngine created with DP_SIZE: {}", DP_SIZE); + tracing::info!("✓ MockVllmEngine created with DP_SIZE: {}", DP_SIZE); // Set up KV events subscriber let mut kv_events_subscriber = test_component.subscribe(KV_EVENT_SUBJECT).await?; - println!("✓ KV events subscriber created"); + tracing::info!("✓ KV events subscriber created"); // Wrap with Ingress and register with component/endpoint let ingress = Ingress::for_engine(engine)?; - println!("✓ Ingress wrapper created"); + tracing::info!("✓ Ingress wrapper created"); // Start the server in background let server_handle = tokio::spawn({ @@ -448,18 +448,18 @@ mod integration_tests { } } }); - println!("✓ Server started in background"); + tracing::info!("✓ Server started in background"); // Give server time to start tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; - println!("✓ Server startup delay completed"); + tracing::info!("✓ Server startup delay completed"); // Print all registered instances from etcd match test_component.list_instances().await { Ok(instances) => { - println!("📋 Found {} registered instances:", instances.len()); + tracing::info!("📋 Found {} registered instances:", instances.len()); for instance in instances { - println!( + tracing::info!( " • {}/{}/{} (ID: {})", instance.namespace, instance.component, @@ -469,7 +469,7 @@ mod integration_tests { } } Err(e) => { - println!("❌ Failed to list instances: {}", e); + tracing::error!("❌ Failed to list instances: {}", e); } } @@ -480,10 +480,10 @@ mod integration_tests { .endpoint("generate") .client() .await?; - println!("✓ Client created"); + tracing::info!("✓ Client created"); let router = PushRouter::from_client(client, Default::default()).await?; - println!("✓ Router created"); + tracing::info!("✓ Router created"); // Create test requests for both DP workers let create_request = |tokens: Vec, dp_rank: u32| DirectRequest { @@ -499,14 +499,14 @@ mod integration_tests { create_request(vec![1, 2, 3, 4, 5], 1), create_request(vec![1, 2, 3, 4, 5], 1), ]; - println!( + tracing::info!( "✓ Test requests created ({} requests total)", requests.len() ); // Test each request for (i, request) in requests.into_iter().enumerate() { - println!("Testing request {}", i + 1); + tracing::info!("Testing request {}", i + 1); let response_stream = router.generate(Context::new(request)).await?; let responses: Vec> = response_stream.collect().await; @@ -535,17 +535,17 @@ mod integration_tests { } } - println!( + tracing::info!( "✓ Request {} completed successfully with {} tokens", i + 1, responses.len() ); } - println!("🎉 All requests completed successfully!"); + tracing::info!("🎉 All requests completed successfully!"); // Try to receive at least one KV event with 100ms timeout - println!("Waiting for KV event with 100ms timeout..."); + tracing::info!("Waiting for KV event with 100ms timeout..."); let msg = timeout(Duration::from_millis(100), kv_events_subscriber.next()) .await .map_err(|_| Error::msg("Timeout waiting for KV event"))? @@ -553,7 +553,7 @@ mod integration_tests { match serde_json::from_slice::(&msg.payload) { Ok(event) => { - println!("✓ Received KV event: {:?}", event); + tracing::info!("✓ Received KV event: {:?}", event); } Err(e) => { return Err(Error::msg(format!("Failed to deserialize KV event: {}", e))); @@ -570,7 +570,7 @@ mod integration_tests { tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; let processed_endpoints = metrics_aggregator.get_endpoints(); - println!( + tracing::info!( "Found {} metrics endpoints", processed_endpoints.endpoints.len() ); @@ -580,17 +580,17 @@ mod integration_tests { !processed_endpoints.endpoints.is_empty(), "Should find at least one metrics endpoint" ); - println!( + tracing::info!( "✓ Successfully found {} metrics endpoints", processed_endpoints.endpoints.len() ); // Verify the metrics endpoints contain valid data for (worker_id, endpoint) in &processed_endpoints.endpoints { - println!("✓ Worker {} metrics: {:?}", worker_id, endpoint.data); + tracing::info!("✓ Worker {} metrics: {:?}", worker_id, endpoint.data); } - println!("🎉 Event verification completed!"); + tracing::info!("🎉 Event verification completed!"); // Cleanup distributed.shutdown(); From fe20aa301566b161551d8c453570ef30303d6c04 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Mon, 16 Jun 2025 16:47:07 -0700 Subject: [PATCH 13/36] async engine core --- lib/llm/src/mocker/engine.rs | 176 ++++++++++++++++++++++++----------- 1 file changed, 120 insertions(+), 56 deletions(-) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index 34f137d661..c7405d95e0 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -19,15 +19,17 @@ //! to provide streaming token generation with realistic timing simulation. use crate::kv_router::publisher::WorkerMetricsPublisher; -use crate::mocker::protocols::{DirectRequest, MockEngineArgs, OutputSignal}; +use crate::mocker::protocols::DirectRequest; +use crate::mocker::protocols::{MockEngineArgs, OutputSignal}; use crate::mocker::scheduler::Scheduler; +use crate::protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest}; +use crate::protocols::TokenIdType; use tokio_util::sync::CancellationToken; use dynamo_runtime::{ component::Component, engine::AsyncEngineContextProvider, pipeline::{async_trait, AsyncEngine, Error, ManyOut, ResponseStream, SingleIn}, - protocols::annotated::Annotated, traits::DistributedRuntimeProvider, Result, }; @@ -42,22 +44,16 @@ use tokio::time::{interval, Duration}; use tokio_stream::wrappers::ReceiverStream; use uuid::Uuid; -/// Generate a random printable character -fn generate_random_char() -> String { +/// Generate a random token ID from 0 to 50k +fn generate_random_token() -> TokenIdType { let mut rng = rand::rng(); - let selection = match rng.random_range(0..4) { - 0 => ('a'..='z').nth(rng.random_range(0..26)).unwrap(), // lowercase - 1 => ('A'..='Z').nth(rng.random_range(0..26)).unwrap(), // uppercase - 2 => ('0'..='9').nth(rng.random_range(0..10)).unwrap(), // digits - _ => [' ', '.', ',', '!', '?'][rng.random_range(0..5)], // punctuation/space - }; - selection.to_string() + rng.random_range(1..50000) } /// AsyncEngine wrapper around the Scheduler that generates random character tokens pub struct MockVllmEngine { schedulers: Vec, - active_requests: Arc>>>, + active_requests: Arc>>>, dp_size: u32, cancel_token: CancellationToken, } @@ -69,9 +65,10 @@ impl MockVllmEngine { component: Option, cancel_token: Option, ) -> Result { - let active_requests = Arc::new(Mutex::new( - HashMap::>::new(), - )); + let active_requests = Arc::new(Mutex::new(HashMap::< + Uuid, + mpsc::UnboundedSender, + >::new())); let cancel_token = cancel_token.unwrap_or_default(); @@ -105,7 +102,7 @@ impl MockVllmEngine { /// Returns schedulers and their corresponding KV event receivers fn start_schedulers( args: MockEngineArgs, - active_requests: Arc>>>, + active_requests: Arc>>>, cancel_token: CancellationToken, ) -> ( Vec, @@ -152,7 +149,7 @@ impl MockVllmEngine { // Notify the specific request that a token was generated let active = active_requests_clone.lock().await; if let Some(request_tx) = active.get(&signal.uuid) { - let _ = request_tx.send(signal).await; + let _ = request_tx.send(signal); } } _ = cancel_token_cloned.cancelled() => { @@ -305,14 +302,27 @@ impl MockVllmEngine { } #[async_trait] -impl AsyncEngine, ManyOut>, Error> for MockVllmEngine { +impl AsyncEngine, ManyOut, Error> + for MockVllmEngine +{ async fn generate( &self, - input: SingleIn, - ) -> Result>, Error> { - let (mut request, ctx) = input.into_parts(); - - let dp_rank = request.dp_rank.unwrap_or(0); + input: SingleIn, + ) -> Result, Error> { + let (request, ctx) = input.into_parts(); + + // Extract dp_rank from annotations if present + let dp_rank = request + .annotations + .iter() + .find_map(|ann| { + if ann.starts_with("dp_rank:") { + ann.strip_prefix("dp_rank:").and_then(|s| s.parse().ok()) + } else { + None + } + }) + .unwrap_or(0); // Validate dp_rank if dp_rank >= self.dp_size { @@ -323,9 +333,20 @@ impl AsyncEngine, ManyOut>, Error> for } let request_uuid = ctx.id().parse().unwrap_or(Uuid::new_v4()); - request.uuid = Some(request_uuid); - let (request_tx, mut request_rx) = mpsc::channel::(64); + // Convert PreprocessedRequest to DirectRequest for scheduler + let direct_request = DirectRequest { + tokens: request.token_ids.clone(), + max_output_tokens: request + .stop_conditions + .max_tokens + .expect("max_output_tokens must be specified for mocker") + as usize, + uuid: Some(request_uuid), + dp_rank: Some(dp_rank), + }; + + let (request_tx, mut request_rx) = mpsc::unbounded_channel::(); { let mut active = self.active_requests.lock().await; active.insert(request_uuid, request_tx); @@ -333,35 +354,61 @@ impl AsyncEngine, ManyOut>, Error> for // Send the request to the appropriate scheduler based on dp_rank self.schedulers[dp_rank as usize] - .receive(request.clone()) + .receive(direct_request) .await; // Create a simple channel for the stream - let (stream_tx, stream_rx) = mpsc::channel::>(64); + let (stream_tx, stream_rx) = mpsc::channel::(64); let active_requests = self.active_requests.clone(); let async_context = ctx.context(); let cancel_token = self.cancel_token.clone(); + let max_tokens = request.stop_conditions.max_tokens.unwrap_or(100) as usize; // Spawn a task to handle the complex async logic tokio::spawn(async move { + let mut token_count = 0; + loop { tokio::select! { Some(signal) = request_rx.recv() => { - if signal.completed { + if signal.completed || token_count >= max_tokens { + // Send final output with finish reason + let final_output = if token_count >= max_tokens { + LLMEngineOutput::length() + } else { + LLMEngineOutput::stop() + }; + + let _ = stream_tx.send(final_output).await; break; } - let output = generate_random_char(); - if stream_tx.send(Annotated::from_data(output)).await.is_err() { + + // Generate a new token + let token_id = generate_random_token(); + token_count += 1; + + let output = LLMEngineOutput { + token_ids: vec![token_id], + tokens: None, // Let backend handle detokenization + text: None, + cum_log_probs: None, + log_probs: None, + finish_reason: None, + }; + + if stream_tx.send(output).await.is_err() { break; } } _ = async_context.stopped() => { + let _ = stream_tx.send(LLMEngineOutput::cancelled()).await; break; } _ = cancel_token.cancelled() => { + let _ = stream_tx.send(LLMEngineOutput::cancelled()).await; break; } } @@ -383,6 +430,7 @@ mod integration_tests { use super::*; use crate::kv_router::indexer::RouterEvent; use crate::kv_router::KV_EVENT_SUBJECT; + use crate::protocols::common::{SamplingOptions, StopConditions}; use dynamo_runtime::{ pipeline::Context, pipeline::{network::Ingress, PushRouter}, @@ -486,11 +534,17 @@ mod integration_tests { tracing::info!("✓ Router created"); // Create test requests for both DP workers - let create_request = |tokens: Vec, dp_rank: u32| DirectRequest { - tokens, - max_output_tokens: TOKENS_PER_REQUEST, - uuid: None, - dp_rank: Some(dp_rank), + let create_request = |tokens: Vec, dp_rank: u32| PreprocessedRequest { + token_ids: tokens, + stop_conditions: StopConditions { + max_tokens: Some(TOKENS_PER_REQUEST as u32), + ..Default::default() + }, + sampling_options: SamplingOptions::default(), + eos_token_ids: vec![], + mdc_sum: None, + annotations: vec![format!("dp_rank:{}", dp_rank)], + estimated_prefix_hit_num_blocks: None, }; let requests = vec![ @@ -509,36 +563,46 @@ mod integration_tests { tracing::info!("Testing request {}", i + 1); let response_stream = router.generate(Context::new(request)).await?; - let responses: Vec> = response_stream.collect().await; + let responses: Vec = response_stream.collect().await; - // Verify each stream produces exactly the expected number of tokens - assert_eq!( - responses.len(), - TOKENS_PER_REQUEST, - "Request {} should produce {} tokens, got {}", - i + 1, - TOKENS_PER_REQUEST, - responses.len() + // Should have at least one response + assert!( + !responses.is_empty(), + "Request {} should produce at least one response", + i + 1 ); - // Verify all tokens contain valid data - for (j, token) in responses.iter().enumerate() { - if let Some(char_data) = &token.data { - assert!( - !char_data.is_empty(), - "Request {} token {} should not be empty", - i + 1, - j + 1 - ); - } else { - panic!("Request {} token {} should have data", i + 1, j + 1); + // Count total tokens generated (excluding final message) + let mut total_tokens = 0; + let mut has_finish_reason = false; + + for response in &responses { + total_tokens += response.token_ids.len(); + if response.finish_reason.is_some() { + has_finish_reason = true; } } + // Should have a finish reason in the last response + assert!( + has_finish_reason, + "Request {} should have a finish reason", + i + 1 + ); + + // Verify we got approximately the expected number of tokens + assert!( + total_tokens <= TOKENS_PER_REQUEST + 1, // +1 for potential final empty response + "Request {} generated {} tokens, expected at most {}", + i + 1, + total_tokens, + TOKENS_PER_REQUEST + 1 + ); + tracing::info!( "✓ Request {} completed successfully with {} tokens", i + 1, - responses.len() + total_tokens ); } From 2fbf998f8d1e1651735ae2d2b71d49a604125e24 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Mon, 16 Jun 2025 17:20:00 -0700 Subject: [PATCH 14/36] hook up with dynamo run --- launch/dynamo-run/src/flags.rs | 36 ++++++++++++++++++++++++---- launch/dynamo-run/src/lib.rs | 44 ++++++++++++++++++++++++++++++++++ launch/dynamo-run/src/opt.rs | 12 +++++++++- lib/llm/src/mocker/engine.rs | 41 +++++++++++++++++++++++++++++-- 4 files changed, 126 insertions(+), 7 deletions(-) diff --git a/launch/dynamo-run/src/flags.rs b/launch/dynamo-run/src/flags.rs index 2ac7286302..961fb2f800 100644 --- a/launch/dynamo-run/src/flags.rs +++ b/launch/dynamo-run/src/flags.rs @@ -151,6 +151,18 @@ pub struct Flags { /// These are the command line arguments to the python engine when using `pystr` or `pytok`. #[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)] pub last: Vec, + + /// Mocker engine configuration from a JSON file. + /// Example file contents: + /// { + /// "speedup_ratio": 1.0, + /// "dp_size": 1, + /// "num_gpu_blocks": 16384, + /// "max_num_batched_tokens": 8192, + /// "watermark": 0.01 + /// } + #[arg(long)] + pub extra_mocker_args: Option, } impl Flags { @@ -216,12 +228,12 @@ impl Flags { out } - /// Load extra engine arguments from a JSON file + /// Load extra arguments from a JSON file /// Returns a HashMap of parameter names to values - pub fn load_extra_engine_args( - &self, + fn load_json_args( + path: &Option, ) -> anyhow::Result>> { - if let Some(path) = &self.extra_engine_args { + if let Some(path) = path { let file_content = std::fs::read_to_string(path)?; let args: HashMap = serde_json::from_str(&file_content)?; Ok(Some(args)) @@ -229,6 +241,22 @@ impl Flags { Ok(None) } } + + /// Load extra engine arguments from a JSON file + /// Returns a HashMap of parameter names to values + pub fn load_extra_engine_args( + &self, + ) -> anyhow::Result>> { + Self::load_json_args(&self.extra_engine_args) + } + + /// Load extra mocker arguments from a JSON file + /// Returns a HashMap of parameter names to values + pub fn load_extra_mocker_args( + &self, + ) -> anyhow::Result>> { + Self::load_json_args(&self.extra_mocker_args) + } } #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)] diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs index 82f0841a28..c2b972721e 100644 --- a/launch/dynamo-run/src/lib.rs +++ b/launch/dynamo-run/src/lib.rs @@ -285,6 +285,50 @@ pub async fn run( model: Box::new(local_model), } } + + Output::Mocker => { + // Load mocker args from JSON file if provided + let mocker_args = flags.load_extra_mocker_args()?; + + let mut builder = dynamo_llm::mocker::protocols::MockEngineArgs::builder(); + + // Use kv_cache_block_size flag as block_size if provided + if let Some(block_size) = flags.kv_cache_block_size { + builder = builder.block_size(block_size); + } + + // Apply args from JSON file if provided + if let Some(args) = mocker_args { + if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) { + builder = builder.speedup_ratio(v); + } + if let Some(v) = args.get("dp_size").and_then(|v| v.as_u64()) { + builder = builder.dp_size(v as u32); + } + if let Some(v) = args.get("num_gpu_blocks").and_then(|v| v.as_u64()) { + builder = builder.num_gpu_blocks(v as usize); + } + if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) { + builder = builder.max_num_batched_tokens(Some(v as usize)); + } + if let Some(v) = args.get("watermark").and_then(|v| v.as_f64()) { + builder = builder.watermark(v); + } + } + + let args = builder + .build() + .map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {}", e))?; + + let engine = dynamo_llm::mocker::engine::make_mocker_engine(args) + .await + .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {}", e))?; + + EngineConfig::StaticCore { + engine, + model: Box::new(local_model), + } + } }; match in_opt { diff --git a/launch/dynamo-run/src/opt.rs b/launch/dynamo-run/src/opt.rs index 25ab953eb8..3e0d708206 100644 --- a/launch/dynamo-run/src/opt.rs +++ b/launch/dynamo-run/src/opt.rs @@ -90,6 +90,9 @@ pub enum Output { /// Listen for models on nats/etcd, add/remove dynamically Dynamic, + /// Mock vLLM engine for testing and development + Mocker, + #[cfg(feature = "mistralrs")] /// Run inference on a model in a GGUF file using mistralrs w/ candle MistralRs, @@ -126,6 +129,7 @@ impl TryFrom<&str> for Output { "echo_full" => Ok(Output::EchoFull), "echo_core" => Ok(Output::EchoCore), + "mocker" => Ok(Output::Mocker), "dyn" => Ok(Output::Dynamic), @@ -160,6 +164,8 @@ impl fmt::Display for Output { Output::EchoCore => "echo_core", Output::Dynamic => "dyn", + + Output::Mocker => "mocker", }; write!(f, "{s}") } @@ -168,7 +174,11 @@ impl fmt::Display for Output { impl Output { #[allow(unused_mut)] pub fn available_engines() -> Vec { - let mut out = vec!["echo_core".to_string(), "echo_full".to_string()]; + let mut out = vec![ + "echo_core".to_string(), + "echo_full".to_string(), + "mocker".to_string(), + ]; #[cfg(feature = "mistralrs")] { out.push(Output::MistralRs.to_string()); diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index c7405d95e0..ca8a1f6d26 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -19,6 +19,7 @@ //! to provide streaming token generation with realistic timing simulation. use crate::kv_router::publisher::WorkerMetricsPublisher; +use dynamo_runtime::protocols::annotated::Annotated; use crate::mocker::protocols::DirectRequest; use crate::mocker::protocols::{MockEngineArgs, OutputSignal}; use crate::mocker::scheduler::Scheduler; @@ -43,11 +44,12 @@ use tokio::sync::{mpsc, Mutex}; use tokio::time::{interval, Duration}; use tokio_stream::wrappers::ReceiverStream; use uuid::Uuid; +use futures::StreamExt; -/// Generate a random token ID from 0 to 50k +/// Generate a random token ID from 0 to 5k fn generate_random_token() -> TokenIdType { let mut rng = rand::rng(); - rng.random_range(1..50000) + rng.random_range(1..5000) } /// AsyncEngine wrapper around the Scheduler that generates random character tokens @@ -425,6 +427,41 @@ impl AsyncEngine, ManyOut, Error> } } +pub struct AnnotatedMockEngine { + inner: Arc, +} + +impl AnnotatedMockEngine { + pub fn new(inner: Arc) -> Self { + Self { inner } + } +} + +#[async_trait] +impl AsyncEngine, ManyOut>, Error> + for AnnotatedMockEngine +{ + async fn generate( + &self, + input: SingleIn, + ) -> Result>, Error> { + let stream = self.inner.generate(input).await?; + let context = stream.context(); + + // Convert stream of LLMEngineOutput to Annotated + let annotated_stream = stream.map(Annotated::from_data); + + Ok(ResponseStream::new(Box::pin(annotated_stream), context)) + } +} + +/// Create a mocker engine as ExecutionContext +pub async fn make_mocker_engine(args: MockEngineArgs) -> Result { + let engine = MockVllmEngine::new(args, None, None).await?; + let annotated = AnnotatedMockEngine::new(Arc::new(engine)); + Ok(Arc::new(annotated)) +} + #[cfg(test)] mod integration_tests { use super::*; From b5480503e2919ef7682cb45b57317b097499498c Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Mon, 16 Jun 2025 17:44:03 -0700 Subject: [PATCH 15/36] docs --- docs/guides/dynamo_run.md | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md index 2f980f1158..702f628704 100644 --- a/docs/guides/dynamo_run.md +++ b/docs/guides/dynamo_run.md @@ -8,7 +8,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm. Usage: ``` -dynamo-run in=[http|text|dyn://|batch:] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path ] [--model-name ] [--model-config ] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)] +dynamo-run in=[http|text|dyn://|batch:] out=echo_core|echo_full|mocker|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path ] [--model-name ] [--model-config ] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--extra-mocker-args=args_mocker.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)] ``` Example: `dynamo run Qwen/Qwen3-0.6B` @@ -514,6 +514,39 @@ The output looks like this: {"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855} ``` +#### Mocker engine + +The mocker engine is a mock vLLM implementation designed for testing and development purposes. It simulates realistic token generation timing without requiring actual model inference, making it useful for: + +- Testing distributed system components without GPU resources +- Benchmarking infrastructure and networking overhead +- Developing and debugging Dynamo components +- Load testing and performance analysis + +**Basic usage:** + +```bash +dynamo-run in=http out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 +``` + +The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. + +Available options: +- `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster. +- `dp_size`: Number of data parallel workers to simulate (default: 1) +- `num_gpu_blocks`: Number of GPU blocks to simulate for the KV cache (default: 16384). This is normally calculated automatically by the real vllm engine based on the VRAM size and model KV cache size. +- `max_num_batched_tokens`: Maximum number of tokens that can be batched together (default: 8192) +- `watermark`: KV cache watermark threshold as a fraction (default: 0.01) + +**Example with custom settings:** +```bash +# Create configuration file +echo '{"speedup_ratio": 10.0, "dp_size": 4}' > mocker_args.json + +# Run mocker with configuration +dynamo-run in=http out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-mocker-args mocker_args.json +``` + ### Extra engine arguments The vllm and sglang backends support passing any argument the engine accepts. Put the arguments in a JSON file: From c7c4be5d57104d1d223bece851b503a65c000c1d Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Mon, 16 Jun 2025 17:51:20 -0700 Subject: [PATCH 16/36] fmt --- lib/llm/src/mocker/engine.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index ca8a1f6d26..3015b7eb58 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -19,12 +19,12 @@ //! to provide streaming token generation with realistic timing simulation. use crate::kv_router::publisher::WorkerMetricsPublisher; -use dynamo_runtime::protocols::annotated::Annotated; use crate::mocker::protocols::DirectRequest; use crate::mocker::protocols::{MockEngineArgs, OutputSignal}; use crate::mocker::scheduler::Scheduler; use crate::protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest}; use crate::protocols::TokenIdType; +use dynamo_runtime::protocols::annotated::Annotated; use tokio_util::sync::CancellationToken; use dynamo_runtime::{ @@ -37,6 +37,7 @@ use dynamo_runtime::{ use crate::kv_router::protocols::{KvCacheEvent, KvCacheEventData}; use crate::kv_router::publisher::KvEventPublisher; +use futures::StreamExt; use rand::Rng; use std::collections::HashMap; use std::sync::Arc; @@ -44,7 +45,6 @@ use tokio::sync::{mpsc, Mutex}; use tokio::time::{interval, Duration}; use tokio_stream::wrappers::ReceiverStream; use uuid::Uuid; -use futures::StreamExt; /// Generate a random token ID from 0 to 5k fn generate_random_token() -> TokenIdType { @@ -456,7 +456,9 @@ impl AsyncEngine, ManyOut Result { +pub async fn make_mocker_engine( + args: MockEngineArgs, +) -> Result { let engine = MockVllmEngine::new(args, None, None).await?; let annotated = AnnotatedMockEngine::new(Arc::new(engine)); Ok(Arc::new(annotated)) From 3ad77807e402afc1ecb953ca4da7fff0b42edf7d Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Mon, 16 Jun 2025 21:07:57 -0700 Subject: [PATCH 17/36] refactor --- lib/llm/src/mocker/engine.rs | 104 ++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index 3015b7eb58..7b11867535 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -46,6 +46,8 @@ use tokio::time::{interval, Duration}; use tokio_stream::wrappers::ReceiverStream; use uuid::Uuid; +pub const MOCKER_COMPONENT: &str = "mocker"; + /// Generate a random token ID from 0 to 5k fn generate_random_token() -> TokenIdType { let mut rng = rand::rng(); @@ -53,56 +55,56 @@ fn generate_random_token() -> TokenIdType { } /// AsyncEngine wrapper around the Scheduler that generates random character tokens +#[derive(Clone)] pub struct MockVllmEngine { - schedulers: Vec, active_requests: Arc>>>, - dp_size: u32, - cancel_token: CancellationToken, + schedulers: Option>, + engine_args: MockEngineArgs, } impl MockVllmEngine { /// Create a new MockVllmEngine with the given parameters - pub async fn new( - args: MockEngineArgs, - component: Option, - cancel_token: Option, - ) -> Result { + pub fn new(args: MockEngineArgs) -> Self { let active_requests = Arc::new(Mutex::new(HashMap::< Uuid, mpsc::UnboundedSender, >::new())); - let cancel_token = cancel_token.unwrap_or_default(); + Self { + active_requests, + schedulers: None, + engine_args: args, + } + } - // Create schedulers and get their KV event receivers - let (schedulers, kv_event_receivers) = - Self::start_schedulers(args.clone(), active_requests.clone(), cancel_token.clone()); + pub async fn start(&mut self, component: Component) -> Result<()> { + let cancel_token = component.drt().runtime().child_token(); - Self::start_metrics_publishing(&schedulers, component.clone(), cancel_token.clone()) + let (schedulers, kv_event_receiver) = self.start_schedulers( + self.engine_args.clone(), + self.active_requests.clone(), + cancel_token.clone(), + ); + + Self::start_metrics_publishing(&schedulers, Some(component.clone()), cancel_token.clone()) .await?; // Start KV events publishing with the actual receivers from schedulers Self::start_kv_events_publishing( - kv_event_receivers, - component.clone(), - args.block_size, + kv_event_receiver, + Some(component.clone()), + self.engine_args.block_size, cancel_token.clone(), ) .await?; - let engine = Self { - schedulers, - active_requests, - dp_size: args.dp_size, - cancel_token, - }; - - Ok(engine) + Ok(()) } /// Create schedulers and spawn their background tasks for distributing token notifications /// Returns schedulers and their corresponding KV event receivers fn start_schedulers( + &mut self, args: MockEngineArgs, active_requests: Arc>>>, cancel_token: CancellationToken, @@ -110,13 +112,13 @@ impl MockVllmEngine { Vec, Vec>, ) { - let mut schedulers = Vec::new(); + let mut schedulers = Vec::::new(); let mut kv_event_receivers = Vec::new(); // Create multiple schedulers and their background tasks for dp_rank in 0..args.dp_size { // Create a shared output channel that this scheduler will use - let (output_tx, output_rx) = mpsc::unbounded_channel::(); + let (output_tx, mut output_rx) = mpsc::unbounded_channel::(); // Create a channel for KV events from this scheduler let (kv_events_tx, kv_events_rx) = mpsc::unbounded_channel::(); @@ -133,17 +135,14 @@ impl MockVllmEngine { kv_event_receivers.push(kv_events_rx); // Spawn a background task for this scheduler to distribute token notifications to active requests - let output_rx = Arc::new(Mutex::new(output_rx)); + // let output_rx = Arc::new(Mutex::new(output_rx)); let active_requests_clone = active_requests.clone(); let cancel_token_cloned = cancel_token.clone(); tokio::spawn(async move { loop { tokio::select! { - signal_result = async { - let mut rx = output_rx.lock().await; - rx.recv().await - } => { + signal_result = output_rx.recv() => { let Some(signal) = signal_result else { break; // Channel closed }; @@ -162,6 +161,7 @@ impl MockVllmEngine { }); } + self.schedulers = Some(schedulers.clone()); (schedulers, kv_event_receivers) } @@ -327,10 +327,10 @@ impl AsyncEngine, ManyOut, Error> .unwrap_or(0); // Validate dp_rank - if dp_rank >= self.dp_size { + if dp_rank >= self.engine_args.dp_size { return Err(Error::msg(format!( "dp_rank {} is out of bounds for dp_size {}", - dp_rank, self.dp_size + dp_rank, self.engine_args.dp_size ))); } @@ -355,7 +355,7 @@ impl AsyncEngine, ManyOut, Error> } // Send the request to the appropriate scheduler based on dp_rank - self.schedulers[dp_rank as usize] + self.schedulers.as_ref().unwrap()[dp_rank as usize] .receive(direct_request) .await; @@ -364,7 +364,6 @@ impl AsyncEngine, ManyOut, Error> let active_requests = self.active_requests.clone(); let async_context = ctx.context(); - let cancel_token = self.cancel_token.clone(); let max_tokens = request.stop_conditions.max_tokens.unwrap_or(100) as usize; // Spawn a task to handle the complex async logic @@ -373,7 +372,11 @@ impl AsyncEngine, ManyOut, Error> loop { tokio::select! { - Some(signal) = request_rx.recv() => { + maybe_signal = request_rx.recv() => { + let Some(signal) = maybe_signal else { + break; + }; + if signal.completed || token_count >= max_tokens { // Send final output with finish reason let final_output = if token_count >= max_tokens { @@ -408,11 +411,6 @@ impl AsyncEngine, ManyOut, Error> let _ = stream_tx.send(LLMEngineOutput::cancelled()).await; break; } - - _ = cancel_token.cancelled() => { - let _ = stream_tx.send(LLMEngineOutput::cancelled()).await; - break; - } } } @@ -428,13 +426,18 @@ impl AsyncEngine, ManyOut, Error> } pub struct AnnotatedMockEngine { - inner: Arc, + inner: MockVllmEngine, } impl AnnotatedMockEngine { - pub fn new(inner: Arc) -> Self { + pub fn new(inner: MockVllmEngine) -> Self { Self { inner } } + + pub async fn start(&self, component: Component) -> Result<()> { + self.inner.clone().start(component).await?; + Ok(()) + } } #[async_trait] @@ -459,9 +462,9 @@ impl AsyncEngine, ManyOut Result { - let engine = MockVllmEngine::new(args, None, None).await?; - let annotated = AnnotatedMockEngine::new(Arc::new(engine)); - Ok(Arc::new(annotated)) + Ok(Arc::new(AnnotatedMockEngine::new(MockVllmEngine::new( + args, + )))) } #[cfg(test)] @@ -495,7 +498,7 @@ mod integration_tests { // Create component for MockVllmEngine (needed for publishers) let test_component = distributed .namespace("test")? - .component("mock-vllm")? + .component(MOCKER_COMPONENT)? .service_builder() .create() .await?; @@ -509,7 +512,10 @@ mod integration_tests { .build() .unwrap(); - let engine = Arc::new(MockVllmEngine::new(args, Some(test_component.clone()), None).await?); + let mut engine = MockVllmEngine::new(args); + engine.start(test_component.clone()).await?; + tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; + let engine = Arc::new(engine); tracing::info!("✓ MockVllmEngine created with DP_SIZE: {}", DP_SIZE); // Set up KV events subscriber @@ -563,7 +569,7 @@ mod integration_tests { // Create client let client = distributed .namespace("test")? - .component("mock-vllm")? + .component(MOCKER_COMPONENT)? .endpoint("generate") .client() .await?; From c78bef274c1532e753cb4f526a2ecebd96c7d138 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 17 Jun 2025 01:23:26 -0700 Subject: [PATCH 18/36] works with kv router --- docs/guides/dynamo_run.md | 13 +--- launch/dynamo-run/src/lib.rs | 17 +++-- lib/llm/src/mocker/engine.rs | 122 ++++++++++++++++++++++---------- lib/llm/src/mocker/scheduler.rs | 5 ++ 4 files changed, 107 insertions(+), 50 deletions(-) diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md index aec0b9d308..517e8381bb 100644 --- a/docs/guides/dynamo_run.md +++ b/docs/guides/dynamo_run.md @@ -525,10 +525,6 @@ The mocker engine is a mock vLLM implementation designed for testing and develop **Basic usage:** -```bash -dynamo-run in=http out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 -``` - The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. Available options: @@ -538,13 +534,10 @@ Available options: - `max_num_batched_tokens`: Maximum number of tokens that can be batched together (default: 8192) - `watermark`: KV cache watermark threshold as a fraction (default: 0.01) -**Example with custom settings:** ```bash -# Create configuration file -echo '{"speedup_ratio": 10.0, "dp_size": 4}' > mocker_args.json - -# Run mocker with configuration -dynamo-run in=http out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-mocker-args mocker_args.json +echo '{"speedup_ratio": 10.0}' > mocker_args.json +dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 +dynamo-run in=http out=dyn --router-mode kv ``` ### Extra engine arguments diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs index c2b972721e..11e70830c1 100644 --- a/launch/dynamo-run/src/lib.rs +++ b/launch/dynamo-run/src/lib.rs @@ -58,6 +58,7 @@ pub async fn run( anyhow::bail!("Cannot use endpoint for both in and out"); } + let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?; let cancel_token = runtime.primary_token(); let maybe_path = flags .model_path_pos @@ -287,6 +288,11 @@ pub async fn run( } Output::Mocker => { + let endpoint = match &in_opt { + Input::Endpoint(path) => path.parse()?, + _ => internal_endpoint("mocker"), + }; + // Load mocker args from JSON file if provided let mocker_args = flags.load_extra_mocker_args()?; @@ -320,9 +326,13 @@ pub async fn run( .build() .map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {}", e))?; - let engine = dynamo_llm::mocker::engine::make_mocker_engine(args) - .await - .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {}", e))?; + let engine = dynamo_llm::mocker::engine::make_mocker_engine( + distributed_runtime.clone(), + endpoint, + args, + ) + .await + .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {}", e))?; EngineConfig::StaticCore { engine, @@ -355,7 +365,6 @@ pub async fn run( .await?; } Input::Endpoint(path) => { - let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?; crate::input::endpoint::run(distributed_runtime, path, engine_config).await?; } } diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index 7b11867535..d4b8272007 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -25,6 +25,7 @@ use crate::mocker::scheduler::Scheduler; use crate::protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest}; use crate::protocols::TokenIdType; use dynamo_runtime::protocols::annotated::Annotated; +use dynamo_runtime::DistributedRuntime; use tokio_util::sync::CancellationToken; use dynamo_runtime::{ @@ -41,43 +42,38 @@ use futures::StreamExt; use rand::Rng; use std::collections::HashMap; use std::sync::Arc; -use tokio::sync::{mpsc, Mutex}; +use tokio::sync::{mpsc, Mutex, OnceCell}; use tokio::time::{interval, Duration}; use tokio_stream::wrappers::ReceiverStream; use uuid::Uuid; pub const MOCKER_COMPONENT: &str = "mocker"; -/// Generate a random token ID from 0 to 5k +/// Generate a random token ID from 1k to 5k fn generate_random_token() -> TokenIdType { let mut rng = rand::rng(); - rng.random_range(1..5000) + rng.random_range(1000..5000) } /// AsyncEngine wrapper around the Scheduler that generates random character tokens #[derive(Clone)] pub struct MockVllmEngine { active_requests: Arc>>>, - schedulers: Option>, + request_senders: Arc>>>, engine_args: MockEngineArgs, } impl MockVllmEngine { /// Create a new MockVllmEngine with the given parameters pub fn new(args: MockEngineArgs) -> Self { - let active_requests = Arc::new(Mutex::new(HashMap::< - Uuid, - mpsc::UnboundedSender, - >::new())); - Self { - active_requests, - schedulers: None, + active_requests: Arc::new(Mutex::new(HashMap::new())), + request_senders: Arc::new(OnceCell::new()), engine_args: args, } } - pub async fn start(&mut self, component: Component) -> Result<()> { + pub async fn start(&self, component: Component) -> Result<()> { let cancel_token = component.drt().runtime().child_token(); let (schedulers, kv_event_receiver) = self.start_schedulers( @@ -101,10 +97,15 @@ impl MockVllmEngine { Ok(()) } + pub fn direct(&self, request: DirectRequest, dp_rank: usize) { + let senders = self.request_senders.get().expect("Not initialized"); + let _ = senders[dp_rank].send(request); + } + /// Create schedulers and spawn their background tasks for distributing token notifications /// Returns schedulers and their corresponding KV event receivers fn start_schedulers( - &mut self, + &self, args: MockEngineArgs, active_requests: Arc>>>, cancel_token: CancellationToken, @@ -114,6 +115,7 @@ impl MockVllmEngine { ) { let mut schedulers = Vec::::new(); let mut kv_event_receivers = Vec::new(); + let mut senders = Vec::with_capacity(args.dp_size as usize); // Create multiple schedulers and their background tasks for dp_rank in 0..args.dp_size { @@ -131,6 +133,7 @@ impl MockVllmEngine { Some(cancel_token.clone()), ); + senders.push(scheduler.request_sender()); schedulers.push(scheduler); kv_event_receivers.push(kv_events_rx); @@ -161,7 +164,11 @@ impl MockVllmEngine { }); } - self.schedulers = Some(schedulers.clone()); + // Set the senders once + self.request_senders + .set(senders) + .expect("Already initialized"); + (schedulers, kv_event_receivers) } @@ -355,9 +362,7 @@ impl AsyncEngine, ManyOut, Error> } // Send the request to the appropriate scheduler based on dp_rank - self.schedulers.as_ref().unwrap()[dp_rank as usize] - .receive(direct_request) - .await; + self.direct(direct_request, dp_rank as usize); // Create a simple channel for the stream let (stream_tx, stream_rx) = mpsc::channel::(64); @@ -374,18 +379,17 @@ impl AsyncEngine, ManyOut, Error> tokio::select! { maybe_signal = request_rx.recv() => { let Some(signal) = maybe_signal else { + let _ = stream_tx.send(LLMEngineOutput::error("All output transmitters closed".to_string())).await; break; }; - if signal.completed || token_count >= max_tokens { - // Send final output with finish reason - let final_output = if token_count >= max_tokens { - LLMEngineOutput::length() - } else { - LLMEngineOutput::stop() - }; + if signal.completed && token_count < max_tokens { + let _ = stream_tx.send(LLMEngineOutput::error("Completion signal received before max tokens reached".to_string())).await; + break; + } - let _ = stream_tx.send(final_output).await; + if signal.completed { + let _ = stream_tx.send(LLMEngineOutput::length()).await; break; } @@ -426,17 +430,58 @@ impl AsyncEngine, ManyOut, Error> } pub struct AnnotatedMockEngine { - inner: MockVllmEngine, + inner: Arc, } impl AnnotatedMockEngine { - pub fn new(inner: MockVllmEngine) -> Self { - Self { inner } - } + pub fn new( + inner: MockVllmEngine, + distributed_runtime: DistributedRuntime, + endpoint: dynamo_runtime::protocols::Endpoint, + ) -> Self { + let inner = Arc::new(inner); + let inner_clone = inner.clone(); + + // Start background task to wait for component service and start the engine + tokio::spawn(async move { + loop { + // Try to create component + let Ok(namespace) = distributed_runtime.namespace(&endpoint.namespace) else { + tracing::debug!("Namespace not available yet, retrying..."); + tokio::time::sleep(Duration::from_millis(100)).await; + continue; + }; + + let Ok(component) = namespace.component(&endpoint.component) else { + tracing::debug!("Component not available yet, retrying..."); + tokio::time::sleep(Duration::from_millis(100)).await; + continue; + }; + + // Check if service is available by trying to list instances + let Ok(instances) = component.list_instances().await else { + tracing::debug!("Cannot list instances yet, retrying..."); + tokio::time::sleep(Duration::from_millis(100)).await; + continue; + }; + + if instances.is_empty() { + tracing::debug!("No instances available yet, retrying..."); + tokio::time::sleep(Duration::from_millis(100)).await; + continue; + } - pub async fn start(&self, component: Component) -> Result<()> { - self.inner.clone().start(component).await?; - Ok(()) + tracing::info!("Component service is now available, starting mocker engine"); + + // Start the engine with the component + if let Err(e) = inner_clone.start(component).await { + tracing::error!("Failed to start mocker engine: {}", e); + } + break; + } + }); + + Self { inner } } } @@ -460,11 +505,16 @@ impl AsyncEngine, ManyOut Result { - Ok(Arc::new(AnnotatedMockEngine::new(MockVllmEngine::new( - args, - )))) + // Create the mocker engine + tracing::info!("Creating mocker engine (service will be started in background)"); + let annotated_engine = + AnnotatedMockEngine::new(MockVllmEngine::new(args), distributed_runtime, endpoint); + + Ok(Arc::new(annotated_engine)) } #[cfg(test)] @@ -512,7 +562,7 @@ mod integration_tests { .build() .unwrap(); - let mut engine = MockVllmEngine::new(args); + let engine = MockVllmEngine::new(args); engine.start(test_component.clone()).await?; tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; let engine = Arc::new(engine); diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 255e2380fb..42aabe3c7b 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -417,6 +417,11 @@ impl Scheduler { let _ = self.request_tx.send(request); } + /// Expose the sender + pub fn request_sender(&self) -> mpsc::UnboundedSender { + self.request_tx.clone() + } + /// Get the count of waiting requests pub async fn waiting_count(&self) -> usize { let state = self.state.lock().await; From a206569b23185b7365657cc201332671bdc8b832 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 17 Jun 2025 02:55:17 -0700 Subject: [PATCH 19/36] actually load extra mocker args in guide --- docs/guides/dynamo_run.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md index 517e8381bb..be508f9544 100644 --- a/docs/guides/dynamo_run.md +++ b/docs/guides/dynamo_run.md @@ -536,7 +536,7 @@ Available options: ```bash echo '{"speedup_ratio": 10.0}' > mocker_args.json -dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 +dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-mocker-args mocker_args.json dynamo-run in=http out=dyn --router-mode kv ``` From d3730ffaf3dc533966f4f32d9ae51bbb94a00bff Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Mon, 23 Jun 2025 01:01:17 -0700 Subject: [PATCH 20/36] free blocks if failed to send (receiver dropped) --- lib/llm/src/mocker/scheduler.rs | 91 +++++++++++++++++++++++++++------ 1 file changed, 75 insertions(+), 16 deletions(-) diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 42aabe3c7b..83c49f3de2 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -368,24 +368,19 @@ impl Scheduler { } } - // Send UUID notification for each generated token - if let Some(tx) = &output_tx_clone { - let signal = OutputSignal { - uuid, - completed: false, - }; - let _ = tx.send(signal); + // Check completion and send notification + let is_complete = sequence.generated_tokens() >= sequence.max_output_tokens(); + let send_failed = output_tx_clone.as_ref().is_some_and(|tx| { + tx.send(OutputSignal { uuid, completed: is_complete }).is_err() + }); + + if send_failed { + for signal in &sequence.free_signal() { + kv_manager_guard.process(signal); + } } - // Check if we're done after generating - if sequence.generated_tokens() >= sequence.max_output_tokens() { - if let Some(tx) = &output_tx_clone { - let signal = OutputSignal { - uuid, - completed: true, - }; - let _ = tx.send(signal); - } + if send_failed || is_complete { state_guard.complete(&uuid); continue; } @@ -753,4 +748,68 @@ mod tests { ); println!("Received {} tokens", received_tokens); } + + #[tokio::test] + async fn test_receiver_drop_cleans_up_resources() { + let block_size: usize = 64; + let input_tokens = 256; + let max_output_tokens = 200; // More than we'll receive + + // Create channel for token output + let (output_tx, mut output_rx) = mpsc::unbounded_channel::(); + + // Create scheduler args + let args = MockEngineArgs::builder() + .num_gpu_blocks(10) // Enough for 256 tokens (4 blocks) + .block_size(block_size) + .speedup_ratio(100.0) // Fast simulation + .build() + .unwrap(); + + // Create scheduler + let scheduler = Scheduler::new(args, None, Some(output_tx), None, None); + + // Create request with 256 tokens + let tokens: Vec = (0..input_tokens).map(|i| i as u32).collect(); + let request = DirectRequest { + tokens, + max_output_tokens, + uuid: None, + dp_rank: None, + }; + + scheduler.receive(request).await; + + // Receive exactly 129 tokens + let mut received_count = 0; + while received_count < 129 { + if let Some(_signal) = output_rx.recv().await { + received_count += 1; + } else { + panic!("Channel closed before receiving 129 tokens"); + } + } + + // Drop the receiver immediately + drop(output_rx); + + // Wait for 1 second to allow cleanup + tokio::time::sleep(Duration::from_secs(1)).await; + + // Check forward pass metrics + let metrics = scheduler.get_forward_pass_metrics().await; + + assert_eq!( + metrics.gpu_cache_usage_perc, + 0.0, + "Expected GPU cache usage to be 0%, got {}%", + metrics.gpu_cache_usage_perc * 100.0 + ); + + assert_eq!( + metrics.kv_active_blocks, 0, + "Expected 0 active blocks, got {}", + metrics.kv_active_blocks + ); + } } From 68d822a0b9bafb43879ad7675d54d8e221ef21d6 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Mon, 23 Jun 2025 08:48:31 -0700 Subject: [PATCH 21/36] do not regenereate tokens after pre-emption --- lib/llm/src/mocker/scheduler.rs | 16 ++++++++++------ lib/llm/src/mocker/sequence.rs | 10 +++++++++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 83c49f3de2..3d09e87d98 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -173,7 +173,6 @@ impl SchedulerState { self.prefill_costs.remove(&uuid); eprintln!("Request {} will be preempted", uuid); - // Extract the ActiveSequence from the Request enum // Reset the sequence and get the new sequence and signal // Insert the new sequence back into the requests map and add to waiting queue let Request::Active(mut active_sequence) = request else { @@ -370,9 +369,14 @@ impl Scheduler { // Check completion and send notification let is_complete = sequence.generated_tokens() >= sequence.max_output_tokens(); - let send_failed = output_tx_clone.as_ref().is_some_and(|tx| { - tx.send(OutputSignal { uuid, completed: is_complete }).is_err() - }); + let should_output = sequence.generated_tokens() > sequence.already_generated_tokens(); + + let mut send_failed = false; + if should_output { + send_failed = output_tx_clone.as_ref().is_some_and(|tx| { + tx.send(OutputSignal { uuid, completed: is_complete }).is_err() + }); + } if send_failed { for signal in &sequence.free_signal() { @@ -649,8 +653,8 @@ mod tests { // Assert that we received the expected number of tokens assert!( - received_tokens > expected_tokens, - "Received {} tokens but expected more than {}", + received_tokens == expected_tokens, + "Received {} tokens but expected exactly {}", received_tokens, expected_tokens ); diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs index 17ef65d2c3..c3e0ed497e 100644 --- a/lib/llm/src/mocker/sequence.rs +++ b/lib/llm/src/mocker/sequence.rs @@ -58,6 +58,9 @@ pub struct ActiveSequence { #[getter(copy)] generated_tokens: usize, + #[getter(copy)] + already_generated_tokens: usize, + #[getter(copy)] num_input_tokens: usize, @@ -81,6 +84,7 @@ impl ActiveSequence { block_size, max_output_tokens, generated_tokens: 0, + already_generated_tokens: 0, num_input_tokens, creation_signal, } @@ -215,6 +219,7 @@ impl ActiveSequence { self.tokens.truncate(self.num_input_tokens).unwrap(); self.unique_blocks = create_unique_blocks_from_sequence(&self.tokens, None, self.block_size); + self.already_generated_tokens = self.generated_tokens.max(self.already_generated_tokens); self.generated_tokens = 0; self.creation_signal = Some(MoveBlock::Use(self.unique_blocks.clone())); @@ -358,7 +363,7 @@ mod tests { seq1.push(token); } - // Push token 47 and get the signal - this completes the block and triggers signals + // Push token 48 and get the signal - this completes the block and triggers signals let signal = seq1.push(48); let signal = signal.unwrap(); @@ -382,6 +387,9 @@ mod tests { // Reset seq1 and check that it equals the original clone let free_signals = seq1.reset_with_signal(); + // 49 - 15 generated tokens + assert_eq!(seq1.already_generated_tokens, 34); + // Verify the reset signals include proper cleanup events assert!(!free_signals.is_empty()); } From d69edcf5dac94f10873a1a43c2912d78ed2a9151 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 29 Jun 2025 20:29:43 -0700 Subject: [PATCH 22/36] evictor cleanup --- lib/llm/src/mocker/evictor.rs | 37 ++++++++++++----------------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/lib/llm/src/mocker/evictor.rs b/lib/llm/src/mocker/evictor.rs index bd1f827ebe..d113adffa1 100644 --- a/lib/llm/src/mocker/evictor.rs +++ b/lib/llm/src/mocker/evictor.rs @@ -58,18 +58,14 @@ impl Default for LRUEvictor { } impl LRUEvictor { - /// Create a new LRUEvictor pub fn new(_cleanup_threshold: usize) -> Self { - // Keep the parameter for API compatibility, but ignore it Self::default() } - /// Get an iterator over the keys in the evictor pub fn keys(&self) -> std::collections::hash_map::Keys<'_, T, i64> { self.free_table.keys() } - /// Private helper method to update the data structures with object and counter fn _update(&mut self, object: T, counter: i64) { self.free_table.insert(object.clone(), counter); self.priority_queue.insert(PriorityItem { @@ -78,7 +74,6 @@ impl LRUEvictor { }); } - /// Insert or update an object in the evictor with positive counter pub fn insert(&mut self, object: T) { // Remove old entry if it exists if let Some(&old_counter) = self.free_table.get(&object) { @@ -112,7 +107,6 @@ impl LRUEvictor { self._update(object, counter); } - /// Check if the evictor contains the given object pub fn contains(&self, object: &T) -> bool { self.free_table.contains_key(object) } @@ -120,34 +114,29 @@ impl LRUEvictor { /// Evict an object based on LRU policy (lowest counter value) /// Returns the evicted object or None if no objects are available pub fn evict(&mut self) -> Option { - if let Some(item) = self.priority_queue.pop_first() { + self.priority_queue.pop_first().map(|item| { self.free_table.remove(&item.item); - Some(item.item) - } else { - None - } + item.item + }) } - /// Remove an object from the evictor pub fn remove(&mut self, object: &T) -> bool { - if let Some(&counter) = self.free_table.get(object) { - self.free_table.remove(object); - self.priority_queue.remove(&PriorityItem { - item: object.clone(), - counter, - }); - true - } else { - false - } + let Some(&counter) = self.free_table.get(object) else { + return false; + }; + + self.free_table.remove(object); + self.priority_queue.remove(&PriorityItem { + item: object.clone(), + counter, + }); + true } - /// Get the number of objects in the evictor pub fn len(&self) -> usize { self.free_table.len() } - /// Check if the evictor is empty pub fn is_empty(&self) -> bool { self.free_table.is_empty() } From c08f9eaca7fccfa2356073d5febb4bac18d4ad97 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 29 Jun 2025 21:19:37 -0700 Subject: [PATCH 23/36] only need runtime in dynamic arms --- launch/dynamo-run/src/lib.rs | 27 +++++++++++++++++++++++---- lib/llm/src/mocker/engine.rs | 32 ++++++++++++++++---------------- lib/llm/src/mocker/evictor.rs | 6 +++--- lib/llm/src/mocker/kv_manager.rs | 31 +++++++++---------------------- lib/llm/src/mocker/scheduler.rs | 12 +++++------- 5 files changed, 56 insertions(+), 52 deletions(-) diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs index 11e70830c1..05a2d9c4a3 100644 --- a/launch/dynamo-run/src/lib.rs +++ b/launch/dynamo-run/src/lib.rs @@ -9,6 +9,7 @@ use dynamo_llm::{backend::ExecutionContext, engines::StreamingEngine, local_mode use dynamo_runtime::protocols::Endpoint as EndpointId; use dynamo_runtime::slug::Slug; use dynamo_runtime::{CancellationToken, DistributedRuntime}; +use tokio::sync::OnceCell; mod flags; pub use flags::Flags; @@ -58,13 +59,27 @@ pub async fn run( anyhow::bail!("Cannot use endpoint for both in and out"); } - let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?; let cancel_token = runtime.primary_token(); let maybe_path = flags .model_path_pos .clone() .or(flags.model_path_flag.clone()); + // Create a OnceCell for lazy initialization of distributed runtime + let distributed_runtime_cell: OnceCell = OnceCell::new(); + let runtime_clone = runtime.clone(); + + // Helper closure to get or initialize the distributed runtime + let get_distributed_runtime = || async { + distributed_runtime_cell + .get_or_init(|| async { + DistributedRuntime::from_settings(runtime_clone.clone()) + .await + .expect("Failed to create distributed runtime") + }) + .await + }; + let mut local_model: LocalModel = if is_out_dynamic(&out_opt) { // If output is dynamic we are ingress and don't have a local model, but making an // empty one cleans up the code. @@ -324,15 +339,17 @@ pub async fn run( let args = builder .build() - .map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {}", e))?; + .map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {e}"))?; + // Get or initialize the distributed runtime + let distributed_runtime = get_distributed_runtime().await; let engine = dynamo_llm::mocker::engine::make_mocker_engine( distributed_runtime.clone(), endpoint, args, ) .await - .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {}", e))?; + .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {e}"))?; EngineConfig::StaticCore { engine, @@ -365,7 +382,9 @@ pub async fn run( .await?; } Input::Endpoint(path) => { - crate::input::endpoint::run(distributed_runtime, path, engine_config).await?; + // Get or initialize the distributed runtime + let distributed_runtime = get_distributed_runtime().await; + crate::input::endpoint::run(distributed_runtime.clone(), path, engine_config).await?; } } diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index d4b8272007..bce5b47bb7 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -188,7 +188,7 @@ impl MockVllmEngine { let publisher = metrics_publisher.clone(); async move { if let Err(e) = publisher.create_endpoint(comp.clone()).await { - tracing::error!("Metrics endpoint failed: {}", e); + tracing::error!("Metrics endpoint failed: {e}"); } } }); @@ -216,13 +216,13 @@ impl MockVllmEngine { // Publish metrics if let Err(e) = publisher.publish(Arc::new(metrics)) { - tracing::warn!("Failed to publish metrics for DP rank {}: {}", dp_rank, e); + tracing::warn!("Failed to publish metrics for DP rank {dp_rank}: {e}"); } else { tracing::trace!("Published metrics for DP rank {}", dp_rank); } } _ = cancel_token.cancelled() => { - tracing::info!("Metrics publishing cancelled for DP rank {}", dp_rank); + tracing::info!("Metrics publishing cancelled for DP rank {dp_rank}"); break; } } @@ -256,7 +256,7 @@ impl MockVllmEngine { .expect("Cannot publish KV events without lease") // ← This will PANIC on static! .id(); // let worker_id = 0; - tracing::debug!("Worker_id set to: {}", worker_id); + tracing::debug!("Worker_id set to: {worker_id}"); tracing::info!("Creating KV event publisher"); let kv_event_publisher = Arc::new(KvEventPublisher::new( @@ -272,13 +272,13 @@ impl MockVllmEngine { kv_event_receivers.len() ); for (dp_rank, mut kv_events_rx) in kv_event_receivers.into_iter().enumerate() { - tracing::debug!("Starting background task for DP rank {}", dp_rank); + tracing::debug!("Starting background task for DP rank {dp_rank}"); let publisher = kv_event_publisher.clone(); let dp_rank = dp_rank as u32; let cancel_token = cancel_token.clone(); tokio::spawn(async move { - tracing::debug!("Background task started for DP rank {}", dp_rank); + tracing::debug!("Background task started for DP rank {dp_rank}"); loop { tokio::select! { // Receive actual KV events from the scheduler @@ -291,13 +291,13 @@ impl MockVllmEngine { // Publish the event if let Err(e) = publisher.publish(event) { - tracing::warn!("Failed to publish KV event for DP rank {}: {}", dp_rank, e); + tracing::warn!("Failed to publish KV event for DP rank {dp_rank}: {e}"); } else { - tracing::trace!("Published KV event for DP rank {}", dp_rank); + tracing::trace!("Published KV event for DP rank {dp_rank}"); } } _ = cancel_token.cancelled() => { - tracing::info!("KV events publishing cancelled for DP rank {}", dp_rank); + tracing::info!("KV events publishing cancelled for DP rank {dp_rank}"); break; } } @@ -475,7 +475,7 @@ impl AnnotatedMockEngine { // Start the engine with the component if let Err(e) = inner_clone.start(component).await { - tracing::error!("Failed to start mocker engine: {}", e); + tracing::error!("Failed to start mocker engine: {e}"); } break; } @@ -566,7 +566,7 @@ mod integration_tests { engine.start(test_component.clone()).await?; tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; let engine = Arc::new(engine); - tracing::info!("✓ MockVllmEngine created with DP_SIZE: {}", DP_SIZE); + tracing::info!("✓ MockVllmEngine created with DP_SIZE: {DP_SIZE}"); // Set up KV events subscriber let mut kv_events_subscriber = test_component.subscribe(KV_EVENT_SUBJECT).await?; @@ -587,7 +587,7 @@ mod integration_tests { .start() .await { - eprintln!("❌ Generate endpoint failed: {}", e); + eprintln!("❌ Generate endpoint failed: {e}"); } } }); @@ -612,7 +612,7 @@ mod integration_tests { } } Err(e) => { - tracing::error!("❌ Failed to list instances: {}", e); + tracing::error!("❌ Failed to list instances: {e}"); } } @@ -638,7 +638,7 @@ mod integration_tests { sampling_options: SamplingOptions::default(), eos_token_ids: vec![], mdc_sum: None, - annotations: vec![format!("dp_rank:{}", dp_rank)], + annotations: vec![format!("dp_rank:{dp_rank}")], estimated_prefix_hit_num_blocks: None, }; @@ -712,10 +712,10 @@ mod integration_tests { match serde_json::from_slice::(&msg.payload) { Ok(event) => { - tracing::info!("✓ Received KV event: {:?}", event); + tracing::info!("✓ Received KV event: {event:?}"); } Err(e) => { - return Err(Error::msg(format!("Failed to deserialize KV event: {}", e))); + return Err(Error::msg(format!("Failed to deserialize KV event: {e}"))); } } diff --git a/lib/llm/src/mocker/evictor.rs b/lib/llm/src/mocker/evictor.rs index d113adffa1..63d079180d 100644 --- a/lib/llm/src/mocker/evictor.rs +++ b/lib/llm/src/mocker/evictor.rs @@ -66,7 +66,7 @@ impl LRUEvictor { self.free_table.keys() } - fn _update(&mut self, object: T, counter: i64) { + fn update(&mut self, object: T, counter: i64) { self.free_table.insert(object.clone(), counter); self.priority_queue.insert(PriorityItem { item: object, @@ -87,7 +87,7 @@ impl LRUEvictor { self.positive_counter += 1; let counter = self.positive_counter; - self._update(object, counter); + self.update(object, counter); } /// Push an object to the front with negative counter (highest priority for eviction) @@ -104,7 +104,7 @@ impl LRUEvictor { self.negative_counter -= 1; let counter = self.negative_counter; - self._update(object, counter); + self.update(object, counter); } pub fn contains(&self, object: &T) -> bool { diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs index 9d061b8ed0..d28e577c44 100644 --- a/lib/llm/src/mocker/kv_manager.rs +++ b/lib/llm/src/mocker/kv_manager.rs @@ -220,8 +220,7 @@ impl KvManager { let Some(ref_count) = self.active_blocks.remove(&uuid_block) else { let in_all_blocks = self.all_blocks.contains(&uuid_block); panic!( - "Missing active block for promotion: {:?}. Block still exists: {}", - uuid_block, in_all_blocks + "Missing active block for promotion: {uuid_block:?}. Block still exists: {in_all_blocks}" ); }; @@ -371,7 +370,7 @@ mod tests { ) { let response = rx .try_recv() - .unwrap_or_else(|_| panic!("Expected {} response {}", expected_type, description)); + .unwrap_or_else(|_| panic!("Expected {expected_type} response {description}")); match (&response, expected_type) { (MoveBlockResponse::Store(blocks, _parent_hash), "Store") => { @@ -384,8 +383,7 @@ mod tests { ); assert_eq!( *blocks, expected_blocks, - "Store blocks don't match expected {}", - description + "Store blocks don't match expected {description}" ); } (MoveBlockResponse::Remove(blocks), "Remove") => { @@ -398,14 +396,10 @@ mod tests { ); assert_eq!( *blocks, expected_blocks, - "Remove blocks don't match expected {}", - description + "Remove blocks don't match expected {description}" ); } - _ => panic!( - "Expected {} response, got {:?} {}", - expected_type, response, description - ), + _ => panic!("Expected {expected_type} response, got {response:?} {description}"), } } @@ -414,11 +408,7 @@ mod tests { rx: &mut mpsc::UnboundedReceiver, description: &str, ) { - assert!( - rx.try_recv().is_err(), - "Expected no response {}", - description - ); + assert!(rx.try_recv().is_err(), "Expected no response {description}",); } // Helper function to check if active blocks contain expected blocks with expected ref counts @@ -433,14 +423,12 @@ mod tests { let block = UniqueBlock::FullBlock(id); assert!( manager.active_blocks().contains_key(&block), - "Block {} not found in active blocks", - id + "Block {id} not found in active blocks", ); assert_eq!( manager.active_blocks().get(&block), Some(&ref_count), - "Block {} has wrong reference count", - id + "Block {id} has wrong reference count", ); } } @@ -463,8 +451,7 @@ mod tests { let block = UniqueBlock::FullBlock(id); assert!( inactive_blocks.iter().any(|&b| *b == block), - "Block {} not found in inactive blocks", - id + "Block {id} not found in inactive blocks", ); } } diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 3d09e87d98..2d7f73fe35 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -171,7 +171,7 @@ impl SchedulerState { .remove(&uuid) .expect("Request does not exist."); self.prefill_costs.remove(&uuid); - eprintln!("Request {} will be preempted", uuid); + eprintln!("Request {uuid} will be preempted"); // Reset the sequence and get the new sequence and signal // Insert the new sequence back into the requests map and add to waiting queue @@ -623,7 +623,7 @@ mod tests { // Manual debug ticker that prints forward pass metrics _ = debug_interval.tick() => { let _metrics = scheduler.get_forward_pass_metrics().await; - println!("Forward Pass Metrics: {:#?}", _metrics); + println!("Forward Pass Metrics: {_metrics:#?}"); } Some(_) = output_rx.recv() => { @@ -654,9 +654,7 @@ mod tests { // Assert that we received the expected number of tokens assert!( received_tokens == expected_tokens, - "Received {} tokens but expected exactly {}", - received_tokens, - expected_tokens + "Received {received_tokens} tokens but expected exactly {expected_tokens}" ); } @@ -715,7 +713,7 @@ mod tests { // Manual debug ticker that prints forward pass metrics _ = debug_interval.tick() => { let _metrics = scheduler.get_forward_pass_metrics().await; - println!("Forward Pass Metrics: {:#?}", _metrics); + println!("Forward Pass Metrics: {_metrics:#?}"); } Some(_signal) = output_rx.recv() => { @@ -750,7 +748,7 @@ mod tests { "Test passed! Cache hit rate: {:.3}", metrics.gpu_prefix_cache_hit_rate ); - println!("Received {} tokens", received_tokens); + println!("Received {received_tokens} tokens"); } #[tokio::test] From dee1413670782d07d0a864fadfe05e6c49b917c5 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 29 Jun 2025 21:38:41 -0700 Subject: [PATCH 24/36] no separate extra-mocker-args --- docs/guides/dynamo_run.md | 12 +++++------- launch/dynamo-run/src/flags.rs | 20 -------------------- launch/dynamo-run/src/lib.rs | 20 +++++++++++++------- 3 files changed, 18 insertions(+), 34 deletions(-) diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md index be508f9544..76e7a3e2b4 100644 --- a/docs/guides/dynamo_run.md +++ b/docs/guides/dynamo_run.md @@ -8,7 +8,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm. Usage: ``` -dynamo-run in=[http|text|dyn://|batch:] out=echo_core|echo_full|mocker|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path ] [--model-name ] [--model-config ] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--extra-mocker-args=args_mocker.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)] +dynamo-run in=[http|text|dyn://|batch:] out=echo_core|echo_full|mocker|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path ] [--model-name ] [--model-config ] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)] ``` Example: `dynamo run Qwen/Qwen3-0.6B` @@ -525,18 +525,16 @@ The mocker engine is a mock vLLM implementation designed for testing and develop **Basic usage:** -The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. +The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `num-gpu-blocks`, `max-num-batched-tokens`, and `block-size` are common arguments shared with the real VLLM engine. -Available options: +And below are arguments that are mocker-specific: - `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster. - `dp_size`: Number of data parallel workers to simulate (default: 1) -- `num_gpu_blocks`: Number of GPU blocks to simulate for the KV cache (default: 16384). This is normally calculated automatically by the real vllm engine based on the VRAM size and model KV cache size. -- `max_num_batched_tokens`: Maximum number of tokens that can be batched together (default: 8192) -- `watermark`: KV cache watermark threshold as a fraction (default: 0.01) +- `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists for the real VLLM engine but cannot be passed as an engine arg. ```bash echo '{"speedup_ratio": 10.0}' > mocker_args.json -dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-mocker-args mocker_args.json +dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json dynamo-run in=http out=dyn --router-mode kv ``` diff --git a/launch/dynamo-run/src/flags.rs b/launch/dynamo-run/src/flags.rs index 961fb2f800..4b9f4d0a30 100644 --- a/launch/dynamo-run/src/flags.rs +++ b/launch/dynamo-run/src/flags.rs @@ -151,18 +151,6 @@ pub struct Flags { /// These are the command line arguments to the python engine when using `pystr` or `pytok`. #[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)] pub last: Vec, - - /// Mocker engine configuration from a JSON file. - /// Example file contents: - /// { - /// "speedup_ratio": 1.0, - /// "dp_size": 1, - /// "num_gpu_blocks": 16384, - /// "max_num_batched_tokens": 8192, - /// "watermark": 0.01 - /// } - #[arg(long)] - pub extra_mocker_args: Option, } impl Flags { @@ -249,14 +237,6 @@ impl Flags { ) -> anyhow::Result>> { Self::load_json_args(&self.extra_engine_args) } - - /// Load extra mocker arguments from a JSON file - /// Returns a HashMap of parameter names to values - pub fn load_extra_mocker_args( - &self, - ) -> anyhow::Result>> { - Self::load_json_args(&self.extra_mocker_args) - } } #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)] diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs index 05a2d9c4a3..d62d11201c 100644 --- a/launch/dynamo-run/src/lib.rs +++ b/launch/dynamo-run/src/lib.rs @@ -309,7 +309,7 @@ pub async fn run( }; // Load mocker args from JSON file if provided - let mocker_args = flags.load_extra_mocker_args()?; + let engine_args = flags.load_extra_engine_args()?; let mut builder = dynamo_llm::mocker::protocols::MockEngineArgs::builder(); @@ -319,12 +319,10 @@ pub async fn run( } // Apply args from JSON file if provided - if let Some(args) = mocker_args { - if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) { - builder = builder.speedup_ratio(v); - } - if let Some(v) = args.get("dp_size").and_then(|v| v.as_u64()) { - builder = builder.dp_size(v as u32); + if let Some(args) = engine_args { + // This overwrites the kv_cache_block_size passed in + if let Some(v) = args.get("block_size").and_then(|v| v.as_u64()) { + builder = builder.block_size(v as usize); } if let Some(v) = args.get("num_gpu_blocks").and_then(|v| v.as_u64()) { builder = builder.num_gpu_blocks(v as usize); @@ -332,9 +330,17 @@ pub async fn run( if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) { builder = builder.max_num_batched_tokens(Some(v as usize)); } + + // These are mocker-specific args + if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) { + builder = builder.speedup_ratio(v); + } if let Some(v) = args.get("watermark").and_then(|v| v.as_f64()) { builder = builder.watermark(v); } + if let Some(v) = args.get("dp_size").and_then(|v| v.as_u64()) { + builder = builder.dp_size(v as u32); + } } let args = builder From 99fd3f2a52c85586b34cb9e6203937bbf3343d64 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 29 Jun 2025 21:44:24 -0700 Subject: [PATCH 25/36] update to match batched tokens --- lib/llm/src/mocker/engine.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index bce5b47bb7..1fafc1322a 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -404,6 +404,7 @@ impl AsyncEngine, ManyOut, Error> cum_log_probs: None, log_probs: None, finish_reason: None, + index: None, }; if stream_tx.send(output).await.is_err() { @@ -631,6 +632,7 @@ mod integration_tests { // Create test requests for both DP workers let create_request = |tokens: Vec, dp_rank: u32| PreprocessedRequest { token_ids: tokens, + batch_token_ids: None, stop_conditions: StopConditions { max_tokens: Some(TOKENS_PER_REQUEST as u32), ..Default::default() From 85c7ccfb108539a955b007f87718de5c28efc8fd Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 29 Jun 2025 21:58:05 -0700 Subject: [PATCH 26/36] max-num-seqs --- docs/guides/dynamo_run.md | 2 +- launch/dynamo-run/src/lib.rs | 3 +++ lib/llm/src/mocker/protocols.rs | 3 +++ lib/llm/src/mocker/scheduler.rs | 15 ++++++++++++--- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md index 76e7a3e2b4..5a231e9f89 100644 --- a/docs/guides/dynamo_run.md +++ b/docs/guides/dynamo_run.md @@ -525,7 +525,7 @@ The mocker engine is a mock vLLM implementation designed for testing and develop **Basic usage:** -The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `num-gpu-blocks`, `max-num-batched-tokens`, and `block-size` are common arguments shared with the real VLLM engine. +The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `block-size` are common arguments shared with the real VLLM engine. And below are arguments that are mocker-specific: - `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster. diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs index d62d11201c..4b620a4f56 100644 --- a/launch/dynamo-run/src/lib.rs +++ b/launch/dynamo-run/src/lib.rs @@ -327,6 +327,9 @@ pub async fn run( if let Some(v) = args.get("num_gpu_blocks").and_then(|v| v.as_u64()) { builder = builder.num_gpu_blocks(v as usize); } + if let Some(v) = args.get("max_num_seqs").and_then(|v| v.as_u64()) { + builder = builder.max_num_seqs(Some(v as usize)); + } if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) { builder = builder.max_num_batched_tokens(Some(v as usize)); } diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs index 7fd8895594..2b759eefe6 100644 --- a/lib/llm/src/mocker/protocols.rs +++ b/lib/llm/src/mocker/protocols.rs @@ -91,6 +91,9 @@ pub struct MockEngineArgs { #[builder(default = "64")] pub block_size: usize, + #[builder(default = None)] + pub max_num_seqs: Option, + // default for open api server, for llm class it's 16384 #[builder(default = Some(8192))] pub max_num_batched_tokens: Option, diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 2d7f73fe35..4457725bd1 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -138,6 +138,10 @@ impl SchedulerState { Some(sequence) } + fn num_active_requests(&self) -> usize { + self.prefill.len() + self.decode.len() + } + /// Calculate the current running batched tokens fn num_batched_tokens(&self) -> usize { self.prefill_costs @@ -272,23 +276,28 @@ impl Scheduler { // schedule anymore. let mut current_blocks = kv_manager_guard.num_active_blocks(); let mut current_tokens = state_guard.num_batched_tokens(); + let mut current_seqs = state_guard.num_active_requests(); + while let Some((uuid, request)) = state_guard.next() { let active_sequence = get_active_sequence(request, args.block_size); // Update predictive budgets let prefill_cost = kv_manager_guard.get_prefill_cost(&active_sequence); - let new_tokens = active_sequence.len(); - let new_blocks = (new_tokens + 1) / args.block_size; // this is conservative, assumes no cache hit + let total_tokens = active_sequence.len(); + let new_blocks = (total_tokens + 1) / args.block_size; // this is conservative, assumes no cache hit let new_tokens = prefill_cost.new_tokens; + current_blocks += new_blocks; current_tokens += new_tokens; + current_seqs += 1; // Check if it can be scheduled let under_block_budget = current_blocks as f64 <= (1. - args.watermark) * kv_manager_guard.max_capacity() as f64; let under_token_budget = args.max_num_batched_tokens.is_none_or(|limit| current_tokens <= limit); + let under_seq_budget = args.max_num_seqs.is_none_or(|limit| current_seqs <= limit); // Cannot schedule, put first in line instead - if !(under_block_budget && under_token_budget) { + if !(under_block_budget && under_token_budget && under_seq_budget) { state_guard.first_in_line(uuid, Request::Active(active_sequence)); break; } From ec1f360851ce35eb005730966fa089dd955ff485 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 29 Jun 2025 22:22:51 -0700 Subject: [PATCH 27/36] enable_prefix_caching arg --- docs/guides/dynamo_run.md | 2 +- launch/dynamo-run/src/lib.rs | 3 +++ lib/llm/src/mocker/protocols.rs | 6 ++++- lib/llm/src/mocker/scheduler.rs | 28 +++++++++++++------ lib/llm/src/mocker/sequence.rs | 48 +++++++++++++++++++++++++-------- 5 files changed, 66 insertions(+), 21 deletions(-) diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md index 5a231e9f89..952a77aa2d 100644 --- a/docs/guides/dynamo_run.md +++ b/docs/guides/dynamo_run.md @@ -525,7 +525,7 @@ The mocker engine is a mock vLLM implementation designed for testing and develop **Basic usage:** -The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `block-size` are common arguments shared with the real VLLM engine. +The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are common arguments shared with the real VLLM engine. And below are arguments that are mocker-specific: - `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster. diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs index 4b620a4f56..4de3fc3358 100644 --- a/launch/dynamo-run/src/lib.rs +++ b/launch/dynamo-run/src/lib.rs @@ -333,6 +333,9 @@ pub async fn run( if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) { builder = builder.max_num_batched_tokens(Some(v as usize)); } + if let Some(v) = args.get("enable_prefix_caching").and_then(|v| v.as_bool()) { + builder = builder.enable_prefix_caching(v); + } // These are mocker-specific args if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) { diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs index 2b759eefe6..880b97495c 100644 --- a/lib/llm/src/mocker/protocols.rs +++ b/lib/llm/src/mocker/protocols.rs @@ -91,13 +91,17 @@ pub struct MockEngineArgs { #[builder(default = "64")] pub block_size: usize, - #[builder(default = None)] + // This was 1024 in the past but reverted back to 256 + #[builder(default = Some(256))] pub max_num_seqs: Option, // default for open api server, for llm class it's 16384 #[builder(default = Some(8192))] pub max_num_batched_tokens: Option, + #[builder(default = true)] + pub enable_prefix_caching: bool, + #[builder(default = "0.01")] pub watermark: f64, diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs index 4457725bd1..0223b04d5e 100644 --- a/lib/llm/src/mocker/scheduler.rs +++ b/lib/llm/src/mocker/scheduler.rs @@ -279,7 +279,7 @@ impl Scheduler { let mut current_seqs = state_guard.num_active_requests(); while let Some((uuid, request)) = state_guard.next() { - let active_sequence = get_active_sequence(request, args.block_size); + let active_sequence = get_active_sequence(request, args.block_size, args.enable_prefix_caching); // Update predictive budgets let prefill_cost = kv_manager_guard.get_prefill_cost(&active_sequence); @@ -492,7 +492,11 @@ impl Scheduler { } /// Convert a Request to an ActiveSequence -fn get_active_sequence(request: Request, block_size: usize) -> ActiveSequence { +fn get_active_sequence( + request: Request, + block_size: usize, + enable_prefix_caching: bool, +) -> ActiveSequence { if let Request::Active(active_seq) = request { return active_seq; } @@ -505,6 +509,7 @@ fn get_active_sequence(request: Request, block_size: usize) -> ActiveSequence { direct_request.tokens, direct_request.max_output_tokens, Some(block_size), + enable_prefix_caching, ) } @@ -552,10 +557,15 @@ mod tests { use std::time::Duration; #[rstest] - #[case::random(false)] - #[case::caching(true)] + #[case::random_no_prefix_caching(false, false)] + #[case::random_with_prefix_caching(false, true)] + #[case::caching_no_prefix_caching(true, false)] + #[case::caching_with_prefix_caching(true, true)] #[tokio::test] - async fn test_scheduler_token_generation_patterns(#[case] use_shared_tokens: bool) { + async fn test_scheduler_token_generation_patterns( + #[case] use_shared_tokens: bool, + #[case] enable_prefix_caching: bool, + ) { std::env::set_var("RUST_LOG", "debug"); let kv_capacity: usize = 500; @@ -567,11 +577,12 @@ mod tests { // Create channel for token output let (output_tx, mut output_rx) = mpsc::unbounded_channel::(); - // Create scheduler args using builder + // Create scheduler args using builder - now including enable_prefix_caching let args = MockEngineArgs::builder() .num_gpu_blocks(kv_capacity) .block_size(block_size) .speedup_ratio(10.0) + .enable_prefix_caching(enable_prefix_caching) .build() .unwrap(); @@ -651,13 +662,14 @@ mod tests { // Calculate and print elapsed time let elapsed = start_time.elapsed(); println!( - "Test completed in: {:?} for {} case", + "Test completed in: {:?} for {} case with prefix_caching={}", elapsed, if use_shared_tokens { "caching" } else { "random" - } + }, + enable_prefix_caching ); // Assert that we received the expected number of tokens diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs index c3e0ed497e..67cc12475e 100644 --- a/lib/llm/src/mocker/sequence.rs +++ b/lib/llm/src/mocker/sequence.rs @@ -24,11 +24,18 @@ fn create_unique_blocks_from_sequence( tokens: &TokenBlockSequence, uuid: Option, block_size: usize, + enable_prefix_caching: bool, ) -> Vec { let mut unique_blocks: Vec = tokens .blocks() .iter() - .map(|block| UniqueBlock::FullBlock(block.sequence_hash())) + .map(|block| { + if enable_prefix_caching { + UniqueBlock::FullBlock(block.sequence_hash()) + } else { + UniqueBlock::FullBlock(random::()) + } + }) .collect(); // Only push the partial block if tokens count isn't a multiple of block_size @@ -65,17 +72,26 @@ pub struct ActiveSequence { num_input_tokens: usize, creation_signal: Option, + + #[getter(copy)] + enable_prefix_caching: bool, } impl ActiveSequence { /// Create a new ActiveSequence instance with the provided tokens - pub fn new(tokens: Vec, max_output_tokens: usize, block_size: Option) -> Self { + pub fn new( + tokens: Vec, + max_output_tokens: usize, + block_size: Option, + enable_prefix_caching: bool, + ) -> Self { let block_size = block_size.unwrap_or(64); assert!(block_size > 1, "block_size must be greater than 1"); let num_input_tokens = tokens.len(); let tokens = Tokens::from(tokens).into_sequence(block_size, None); - let unique_blocks = create_unique_blocks_from_sequence(&tokens, None, block_size); + let unique_blocks = + create_unique_blocks_from_sequence(&tokens, None, block_size, enable_prefix_caching); let creation_signal = Some(MoveBlock::Use(unique_blocks.clone())); Self { @@ -87,6 +103,7 @@ impl ActiveSequence { already_generated_tokens: 0, num_input_tokens, creation_signal, + enable_prefix_caching, } } @@ -107,8 +124,9 @@ impl ActiveSequence { tokens: Vec, max_output_tokens: usize, block_size: Option, + enable_prefix_caching: bool, ) -> (Self, Option) { - let mut sequence = Self::new(tokens, max_output_tokens, block_size); + let mut sequence = Self::new(tokens, max_output_tokens, block_size, enable_prefix_caching); let signal = sequence.creation_signal.take(); (sequence, signal) } @@ -139,7 +157,11 @@ impl ActiveSequence { // Replace last partial block with full block if it exists if let Some(UniqueBlock::PartialBlock(uuid)) = self.unique_blocks.last().cloned() { - let last_block_hash = self.tokens.last_complete_block().unwrap().sequence_hash(); + let last_block_hash = if self.enable_prefix_caching { + self.tokens.last_complete_block().unwrap().sequence_hash() + } else { + random::() + }; self.unique_blocks.pop(); self.unique_blocks .push(UniqueBlock::FullBlock(last_block_hash)); @@ -212,13 +234,16 @@ impl ActiveSequence { } /// Reset the sequence to its initial state and return the free signals from freeing current blocks - /// maintaining the uuid of the last partial block pub fn reset_with_signal(&mut self) -> Vec { let free_signal = self.free_signal(); self.tokens.truncate(self.num_input_tokens).unwrap(); - self.unique_blocks = - create_unique_blocks_from_sequence(&self.tokens, None, self.block_size); + self.unique_blocks = create_unique_blocks_from_sequence( + &self.tokens, + None, + self.block_size, + self.enable_prefix_caching, + ); self.already_generated_tokens = self.generated_tokens.max(self.already_generated_tokens); self.generated_tokens = 0; self.creation_signal = Some(MoveBlock::Use(self.unique_blocks.clone())); @@ -246,7 +271,8 @@ mod tests { fn test_active_sequence_push() { // Create a sequence with block size 16 initialized with tokens [0..15] let initial_tokens: Vec = (0..15).collect(); - let (mut seq1, signal1) = ActiveSequence::new_with_signal(initial_tokens, 100, Some(16)); + let (mut seq1, signal1) = + ActiveSequence::new_with_signal(initial_tokens, 100, Some(16), true); assert_eq!(seq1.num_input_tokens(), 15); assert_eq!(seq1.len(), 15); @@ -296,7 +322,7 @@ mod tests { // Create another sequence with block size 16 initialized with tokens [0..17] let extended_tokens: Vec = (0..16).collect(); - let (mut seq2, _) = ActiveSequence::new_with_signal(extended_tokens, 100, Some(16)); + let (mut seq2, _) = ActiveSequence::new_with_signal(extended_tokens, 100, Some(16), true); seq2.push(16); seq2.pop(); seq2.push(16); @@ -398,7 +424,7 @@ mod tests { fn test_active_sequence_generate_signals() { // Create a sequence with block size 16, max_output_tokens 4, initialized with tokens [0..14) let initial_tokens: Vec = (0..14).collect(); - let (mut seq, signal) = ActiveSequence::new_with_signal(initial_tokens, 5, Some(16)); + let (mut seq, signal) = ActiveSequence::new_with_signal(initial_tokens, 5, Some(16), true); // Initial signal - should have received a Use signal for the partial block assert!(signal.is_some()); From 94abc0dea8cc157f15c0a757ddac93681726901f Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 29 Jun 2025 22:25:13 -0700 Subject: [PATCH 28/36] only publish kv events if enable_prefix_caching set true --- lib/llm/src/mocker/engine.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index 1fafc1322a..068625f32e 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -86,13 +86,15 @@ impl MockVllmEngine { .await?; // Start KV events publishing with the actual receivers from schedulers - Self::start_kv_events_publishing( - kv_event_receiver, - Some(component.clone()), - self.engine_args.block_size, - cancel_token.clone(), - ) - .await?; + if self.engine_args.enable_prefix_caching { + Self::start_kv_events_publishing( + kv_event_receiver, + Some(component.clone()), + self.engine_args.block_size, + cancel_token.clone(), + ) + .await?; + } Ok(()) } From 35da284b03d44ad9eff64bb7b3e35b68a095b0af Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Sun, 29 Jun 2025 22:39:05 -0700 Subject: [PATCH 29/36] small note on chunked prefill being false for now --- docs/guides/dynamo_run.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md index 952a77aa2d..472099c568 100644 --- a/docs/guides/dynamo_run.md +++ b/docs/guides/dynamo_run.md @@ -532,6 +532,9 @@ And below are arguments that are mocker-specific: - `dp_size`: Number of data parallel workers to simulate (default: 1) - `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists for the real VLLM engine but cannot be passed as an engine arg. +>[!NOTE] +>Currently, `enable_chunked_prefill` is always assumed to be false, which mirrors the vllm v0 behavior. This is also the current behavior in `examples/llm`. This will be updated in the near future as we move to support vllm v1 (and deprecate support for vllm v0). + ```bash echo '{"speedup_ratio": 10.0}' > mocker_args.json dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json From c7c072d7c915b404b35564a884eb10f0f48a8eb5 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 1 Jul 2025 10:41:40 -0700 Subject: [PATCH 30/36] revert flags --- launch/dynamo-run/src/flags.rs | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/launch/dynamo-run/src/flags.rs b/launch/dynamo-run/src/flags.rs index 4b9f4d0a30..2ac7286302 100644 --- a/launch/dynamo-run/src/flags.rs +++ b/launch/dynamo-run/src/flags.rs @@ -216,12 +216,12 @@ impl Flags { out } - /// Load extra arguments from a JSON file + /// Load extra engine arguments from a JSON file /// Returns a HashMap of parameter names to values - fn load_json_args( - path: &Option, + pub fn load_extra_engine_args( + &self, ) -> anyhow::Result>> { - if let Some(path) = path { + if let Some(path) = &self.extra_engine_args { let file_content = std::fs::read_to_string(path)?; let args: HashMap = serde_json::from_str(&file_content)?; Ok(Some(args)) @@ -229,14 +229,6 @@ impl Flags { Ok(None) } } - - /// Load extra engine arguments from a JSON file - /// Returns a HashMap of parameter names to values - pub fn load_extra_engine_args( - &self, - ) -> anyhow::Result>> { - Self::load_json_args(&self.extra_engine_args) - } } #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)] From de54247b85779b710fc2c7fd057ee8b507a4b845 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 1 Jul 2025 10:46:21 -0700 Subject: [PATCH 31/36] revert dynamo-run changes --- docs/guides/dynamo_run.md | 29 +----------- launch/dynamo-run/src/lib.rs | 85 +----------------------------------- launch/dynamo-run/src/opt.rs | 12 +---- 3 files changed, 3 insertions(+), 123 deletions(-) diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md index 472099c568..0ed572a6d6 100644 --- a/docs/guides/dynamo_run.md +++ b/docs/guides/dynamo_run.md @@ -8,7 +8,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm. Usage: ``` -dynamo-run in=[http|text|dyn://|batch:] out=echo_core|echo_full|mocker|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path ] [--model-name ] [--model-config ] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)] +dynamo-run in=[http|text|dyn://|batch:] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path ] [--model-name ] [--model-config ] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)] ``` Example: `dynamo run Qwen/Qwen3-0.6B` @@ -514,33 +514,6 @@ The output looks like this: {"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855} ``` -#### Mocker engine - -The mocker engine is a mock vLLM implementation designed for testing and development purposes. It simulates realistic token generation timing without requiring actual model inference, making it useful for: - -- Testing distributed system components without GPU resources -- Benchmarking infrastructure and networking overhead -- Developing and debugging Dynamo components -- Load testing and performance analysis - -**Basic usage:** - -The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are common arguments shared with the real VLLM engine. - -And below are arguments that are mocker-specific: -- `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster. -- `dp_size`: Number of data parallel workers to simulate (default: 1) -- `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists for the real VLLM engine but cannot be passed as an engine arg. - ->[!NOTE] ->Currently, `enable_chunked_prefill` is always assumed to be false, which mirrors the vllm v0 behavior. This is also the current behavior in `examples/llm`. This will be updated in the near future as we move to support vllm v1 (and deprecate support for vllm v0). - -```bash -echo '{"speedup_ratio": 10.0}' > mocker_args.json -dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json -dynamo-run in=http out=dyn --router-mode kv -``` - ### Extra engine arguments The vllm and sglang backends support passing any argument the engine accepts. Put the arguments in a JSON file: diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs index 4de3fc3358..d6ec1c9322 100644 --- a/launch/dynamo-run/src/lib.rs +++ b/launch/dynamo-run/src/lib.rs @@ -9,7 +9,6 @@ use dynamo_llm::{backend::ExecutionContext, engines::StreamingEngine, local_mode use dynamo_runtime::protocols::Endpoint as EndpointId; use dynamo_runtime::slug::Slug; use dynamo_runtime::{CancellationToken, DistributedRuntime}; -use tokio::sync::OnceCell; mod flags; pub use flags::Flags; @@ -65,21 +64,6 @@ pub async fn run( .clone() .or(flags.model_path_flag.clone()); - // Create a OnceCell for lazy initialization of distributed runtime - let distributed_runtime_cell: OnceCell = OnceCell::new(); - let runtime_clone = runtime.clone(); - - // Helper closure to get or initialize the distributed runtime - let get_distributed_runtime = || async { - distributed_runtime_cell - .get_or_init(|| async { - DistributedRuntime::from_settings(runtime_clone.clone()) - .await - .expect("Failed to create distributed runtime") - }) - .await - }; - let mut local_model: LocalModel = if is_out_dynamic(&out_opt) { // If output is dynamic we are ingress and don't have a local model, but making an // empty one cleans up the code. @@ -301,73 +285,6 @@ pub async fn run( model: Box::new(local_model), } } - - Output::Mocker => { - let endpoint = match &in_opt { - Input::Endpoint(path) => path.parse()?, - _ => internal_endpoint("mocker"), - }; - - // Load mocker args from JSON file if provided - let engine_args = flags.load_extra_engine_args()?; - - let mut builder = dynamo_llm::mocker::protocols::MockEngineArgs::builder(); - - // Use kv_cache_block_size flag as block_size if provided - if let Some(block_size) = flags.kv_cache_block_size { - builder = builder.block_size(block_size); - } - - // Apply args from JSON file if provided - if let Some(args) = engine_args { - // This overwrites the kv_cache_block_size passed in - if let Some(v) = args.get("block_size").and_then(|v| v.as_u64()) { - builder = builder.block_size(v as usize); - } - if let Some(v) = args.get("num_gpu_blocks").and_then(|v| v.as_u64()) { - builder = builder.num_gpu_blocks(v as usize); - } - if let Some(v) = args.get("max_num_seqs").and_then(|v| v.as_u64()) { - builder = builder.max_num_seqs(Some(v as usize)); - } - if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) { - builder = builder.max_num_batched_tokens(Some(v as usize)); - } - if let Some(v) = args.get("enable_prefix_caching").and_then(|v| v.as_bool()) { - builder = builder.enable_prefix_caching(v); - } - - // These are mocker-specific args - if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) { - builder = builder.speedup_ratio(v); - } - if let Some(v) = args.get("watermark").and_then(|v| v.as_f64()) { - builder = builder.watermark(v); - } - if let Some(v) = args.get("dp_size").and_then(|v| v.as_u64()) { - builder = builder.dp_size(v as u32); - } - } - - let args = builder - .build() - .map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {e}"))?; - - // Get or initialize the distributed runtime - let distributed_runtime = get_distributed_runtime().await; - let engine = dynamo_llm::mocker::engine::make_mocker_engine( - distributed_runtime.clone(), - endpoint, - args, - ) - .await - .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {e}"))?; - - EngineConfig::StaticCore { - engine, - model: Box::new(local_model), - } - } }; match in_opt { @@ -395,7 +312,7 @@ pub async fn run( } Input::Endpoint(path) => { // Get or initialize the distributed runtime - let distributed_runtime = get_distributed_runtime().await; + let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?; crate::input::endpoint::run(distributed_runtime.clone(), path, engine_config).await?; } } diff --git a/launch/dynamo-run/src/opt.rs b/launch/dynamo-run/src/opt.rs index 3e0d708206..25ab953eb8 100644 --- a/launch/dynamo-run/src/opt.rs +++ b/launch/dynamo-run/src/opt.rs @@ -90,9 +90,6 @@ pub enum Output { /// Listen for models on nats/etcd, add/remove dynamically Dynamic, - /// Mock vLLM engine for testing and development - Mocker, - #[cfg(feature = "mistralrs")] /// Run inference on a model in a GGUF file using mistralrs w/ candle MistralRs, @@ -129,7 +126,6 @@ impl TryFrom<&str> for Output { "echo_full" => Ok(Output::EchoFull), "echo_core" => Ok(Output::EchoCore), - "mocker" => Ok(Output::Mocker), "dyn" => Ok(Output::Dynamic), @@ -164,8 +160,6 @@ impl fmt::Display for Output { Output::EchoCore => "echo_core", Output::Dynamic => "dyn", - - Output::Mocker => "mocker", }; write!(f, "{s}") } @@ -174,11 +168,7 @@ impl fmt::Display for Output { impl Output { #[allow(unused_mut)] pub fn available_engines() -> Vec { - let mut out = vec![ - "echo_core".to_string(), - "echo_full".to_string(), - "mocker".to_string(), - ]; + let mut out = vec!["echo_core".to_string(), "echo_full".to_string()]; #[cfg(feature = "mistralrs")] { out.push(Output::MistralRs.to_string()); From 81c12aab7605cd04c537d8a684413a783cdda6f2 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 1 Jul 2025 10:47:35 -0700 Subject: [PATCH 32/36] tiny reversion --- launch/dynamo-run/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs index d6ec1c9322..529b597d83 100644 --- a/launch/dynamo-run/src/lib.rs +++ b/launch/dynamo-run/src/lib.rs @@ -313,7 +313,7 @@ pub async fn run( Input::Endpoint(path) => { // Get or initialize the distributed runtime let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?; - crate::input::endpoint::run(distributed_runtime.clone(), path, engine_config).await?; + crate::input::endpoint::run(distributed_runtime, path, engine_config).await?; } } From b959df4637dbd3f228d9d605079242f6da375b4b Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 1 Jul 2025 10:48:15 -0700 Subject: [PATCH 33/36] another reversion --- launch/dynamo-run/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs index 529b597d83..82f0841a28 100644 --- a/launch/dynamo-run/src/lib.rs +++ b/launch/dynamo-run/src/lib.rs @@ -311,7 +311,6 @@ pub async fn run( .await?; } Input::Endpoint(path) => { - // Get or initialize the distributed runtime let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?; crate::input::endpoint::run(distributed_runtime, path, engine_config).await?; } From b15070a2e53f1a885a5b5e820f54a310cab7f0a5 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 1 Jul 2025 10:59:07 -0700 Subject: [PATCH 34/36] usize reversion --- lib/llm/src/mocker/engine.rs | 2 +- lib/llm/src/mocker/kv_manager.rs | 2 +- lib/llm/src/mocker/sequence.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs index 068625f32e..b367910792 100644 --- a/lib/llm/src/mocker/engine.rs +++ b/lib/llm/src/mocker/engine.rs @@ -264,7 +264,7 @@ impl MockVllmEngine { let kv_event_publisher = Arc::new(KvEventPublisher::new( comp.clone(), worker_id, - block_size, + block_size as u32, None, )?); tracing::info!("KV event publisher created"); diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs index c2e7f8bb20..d28e577c44 100644 --- a/lib/llm/src/mocker/kv_manager.rs +++ b/lib/llm/src/mocker/kv_manager.rs @@ -58,7 +58,7 @@ pub struct KvManager { max_capacity: usize, #[getter(copy)] - block_size: u32, + block_size: usize, active_blocks: HashMap, diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs index db8178ef1f..1c5c9fdacf 100644 --- a/lib/llm/src/mocker/sequence.rs +++ b/lib/llm/src/mocker/sequence.rs @@ -89,7 +89,7 @@ impl ActiveSequence { assert!(block_size > 1, "block_size must be greater than 1"); let num_input_tokens = tokens.len(); - let tokens = Tokens::from(tokens).into_sequence(block_size, None); + let tokens = Tokens::from(tokens).into_sequence(block_size as u32, None); let unique_blocks = create_unique_blocks_from_sequence(&tokens, None, block_size, enable_prefix_caching); let creation_signal = Some(MoveBlock::Use(unique_blocks.clone())); From 3a20b9dd05f555ba8a4f45aba668794581c47567 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 1 Jul 2025 11:23:33 -0700 Subject: [PATCH 35/36] clippy --- lib/llm/src/mocker/sequence.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs index 1c5c9fdacf..e0fb59a417 100644 --- a/lib/llm/src/mocker/sequence.rs +++ b/lib/llm/src/mocker/sequence.rs @@ -39,7 +39,7 @@ fn create_unique_blocks_from_sequence( .collect(); // Only push the partial block if tokens count isn't a multiple of block_size - if tokens.total_tokens() % (block_size as usize) != 0 { + if tokens.total_tokens() % block_size != 0 { unique_blocks.push(match uuid { Some(uuid) => UniqueBlock::PartialBlock(uuid), None => UniqueBlock::default(), @@ -108,7 +108,7 @@ impl ActiveSequence { } pub fn extra_tokens(&self) -> u32 { - (self.len() % self.block_size as usize) as u32 + (self.len() % self.block_size) as u32 } pub fn len(&self) -> usize { @@ -147,7 +147,7 @@ impl ActiveSequence { self.tokens.append(token).expect("Token push failed."); self.generated_tokens += 1; - if self.len() % (self.block_size as usize) != 1 { + if self.len() % self.block_size != 1 { return None; } @@ -257,7 +257,7 @@ impl ActiveSequence { self.generated_tokens = self.generated_tokens.saturating_sub(1); // Reverts to the last full block - if self.tokens.total_tokens() % (self.block_size as usize) == 0 { + if self.tokens.total_tokens() % self.block_size == 0 { self.unique_blocks.pop(); } } From c74760613b12927315f2b2f1e1c2a8420d8f64d2 Mon Sep 17 00:00:00 2001 From: PeaBrane Date: Tue, 1 Jul 2025 12:00:12 -0700 Subject: [PATCH 36/36] more clippy --- lib/llm/src/mocker/sequence.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs index e0fb59a417..2145d8e561 100644 --- a/lib/llm/src/mocker/sequence.rs +++ b/lib/llm/src/mocker/sequence.rs @@ -318,7 +318,7 @@ mod tests { // Verify state after pushing tokens assert_eq!(seq1.unique_blocks().len(), 2); // One full block and one partial block assert_eq!(seq1.len(), 17); - assert_eq!(seq1.len() % (seq1.block_size() as usize), 1); + assert_eq!(seq1.len() % seq1.block_size(), 1); // Create another sequence with block size 16 initialized with tokens [0..17] let extended_tokens: Vec = (0..16).collect(); @@ -367,12 +367,12 @@ mod tests { "seq2 should have exactly 3 blocks" ); assert_eq!( - seq1.len() % (seq1.block_size() as usize), + seq1.len() % seq1.block_size(), 1, "seq1 should have 1 partial token" ); assert_eq!( - seq2.len() % (seq2.block_size() as usize), + seq2.len() % seq2.block_size(), 1, "seq2 should have 1 partial token" );