From 74bc13f56487a2c3ee67052e7744e8b6f916dc5e Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 27 May 2025 14:00:14 -0700
Subject: [PATCH 01/36] use BTreeSet, and allow for push_front (preemption)

---
 lib/llm/src/mocker/evictor.rs | 204 +++++++++++++++++-----------------
 1 file changed, 103 insertions(+), 101 deletions(-)
diff --git a/lib/llm/src/mocker/evictor.rs b/lib/llm/src/mocker/evictor.rs
index 47a312eede..bd1f827ebe 100644
--- a/lib/llm/src/mocker/evictor.rs
+++ b/lib/llm/src/mocker/evictor.rs
@@ -13,60 +13,103 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::cmp::Eq;
-use std::collections::{HashMap, VecDeque};
+use std::cmp::{Eq, Ordering};
+use std::collections::{BTreeSet, HashMap};
 use std::hash::Hash;
-use std::time::Instant;
+
+/// Pairs an item with its priority counter; `Ord` compares only the counter (the derived `Eq` also compares the item)
+#[derive(Debug, Clone, Eq, PartialEq)]
+struct PriorityItem<T> {
+    item: T,
+    counter: i64,
+}
+
+impl<T: Eq> Ord for PriorityItem<T> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.counter.cmp(&other.counter)
+    }
+}
+
+impl<T: Eq> PartialOrd for PriorityItem<T> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
 
 /// An LRU evictor that maintains objects and evicts them based on their
-/// last accessed time. Implements a "lazy" eviction mechanism where:
-/// 1. The priority queue does not immediately reflect updates or removes
-/// 2. Objects are pushed to the queue in order of increasing priority (older objects first)
-/// 3. The user must ensure objects are added in correct priority (temporal order)
-/// 4. Remove and update operations are lazy - entries remain in the queue until
-///    they are either evicted or cleaned up during maintenance
+/// priority counter. Lower counter values are evicted first.
 #[derive(Debug)]
 pub struct LRUEvictor<T: Clone + Eq + Hash> {
-    free_table: HashMap<T, f64>,
-    priority_queue: VecDeque<(T, f64)>,
-    cleanup_threshold: usize,
-    start_time: Instant,
+    free_table: HashMap<T, i64>,
+    priority_queue: BTreeSet<PriorityItem<T>>,
+    positive_counter: i64,
+    negative_counter: i64,
 }
 
 impl<T: Clone + Eq + Hash> Default for LRUEvictor<T> {
     fn default() -> Self {
         Self {
             free_table: HashMap::new(),
-            priority_queue: VecDeque::new(),
-            cleanup_threshold: 50,
-            start_time: Instant::now(),
+            priority_queue: BTreeSet::new(),
+            positive_counter: 0,
+            negative_counter: 0,
         }
     }
 }
 
 impl<T: Clone + Eq + Hash> LRUEvictor<T> {
-    /// Create a new LRUEvictor with the default cleanup threshold
-    pub fn new(cleanup_threshold: usize) -> Self {
-        Self {
-            cleanup_threshold,
-            ..Default::default()
-        }
-    }
-
-    /// Get the current timestamp as seconds since initialization
-    pub fn current_timestamp(&self) -> f64 {
-        self.start_time.elapsed().as_secs_f64()
+    /// Create a new LRUEvictor
+    pub fn new(_cleanup_threshold: usize) -> Self {
+        // Keep the parameter for API compatibility, but ignore it
+        Self::default()
     }
 
     /// Get an iterator over the keys in the evictor
-    pub fn keys(&self) -> std::collections::hash_map::Keys<'_, T, f64> {
+    pub fn keys(&self) -> std::collections::hash_map::Keys<'_, T, i64> {
         self.free_table.keys()
     }
 
-    /// Insert or update an object in the evictor with current timestamp
+    /// Private helper method to update the data structures with object and counter
+    fn _update(&mut self, object: T, counter: i64) {
+        self.free_table.insert(object.clone(), counter);
+        self.priority_queue.insert(PriorityItem {
+            item: object,
+            counter,
+        });
+    }
+
+    /// Insert or update an object in the evictor with positive counter
     pub fn insert(&mut self, object: T) {
-        let timestamp = self.current_timestamp();
-        self._insert(object, timestamp);
+        // Remove old entry if it exists
+        if let Some(&old_counter) = self.free_table.get(&object) {
+            self.priority_queue.remove(&PriorityItem {
+                item: object.clone(),
+                counter: old_counter,
+            });
+        }
+
+        // Increment positive counter and insert
+        self.positive_counter += 1;
+        let counter = self.positive_counter;
+
+        self._update(object, counter);
+    }
+
+    /// Push an object to the front with negative counter (highest priority for eviction)
+    pub fn push_front(&mut self, object: T) {
+        // Remove old entry if it exists
+        if let Some(&old_counter) = self.free_table.get(&object) {
+            self.priority_queue.remove(&PriorityItem {
+                item: object.clone(),
+                counter: old_counter,
+            });
+        }
+
+        // Decrement negative counter and insert
+        self.negative_counter -= 1;
+        let counter = self.negative_counter;
+
+        self._update(object, counter);
     }
 
     /// Check if the evictor contains the given object
@@ -74,39 +117,29 @@ impl<T: Clone + Eq + Hash> LRUEvictor<T> {
         self.free_table.contains_key(object)
     }
 
-    /// Evict an object based on LRU policy
+    /// Evict an object based on LRU policy (lowest counter value)
     /// Returns the evicted object or None if no objects are available
     pub fn evict(&mut self) -> Option<T> {
-        if self.free_table.is_empty() {
-            return None;
-        }
-
-        while let Some((object, last_accessed)) = self.priority_queue.pop_front() {
-            let Some(&current_last_accessed) = self.free_table.get(&object) else {
-                continue; // entry is already removed
-            };
-
-            if current_last_accessed == last_accessed {
-                self.free_table.remove(&object);
-                return Some(object);
-            } // otherwise entry is stale
+        if let Some(item) = self.priority_queue.pop_first() {
+            self.free_table.remove(&item.item);
+            Some(item.item)
+        } else {
+            None
         }
-
-        None
-    }
-
-    /// Insert or update an object in the evictor
-    fn _insert(&mut self, object: T, last_accessed: f64) {
-        self.free_table.insert(object.clone(), last_accessed);
-        self.priority_queue.push_back((object, last_accessed));
-        self.cleanup_if_necessary();
     }
 
     /// Remove an object from the evictor
-    /// We don't remove from the priority queue immediately, as that would be inefficient
-    /// Outdated entries will be filtered out during eviction or cleanup
     pub fn remove(&mut self, object: &T) -> bool {
-        self.free_table.remove(object).is_some()
+        if let Some(&counter) = self.free_table.get(object) {
+            self.free_table.remove(object);
+            self.priority_queue.remove(&PriorityItem {
+                item: object.clone(),
+                counter,
+            });
+            true
+        } else {
+            false
+        }
     }
 
     /// Get the number of objects in the evictor
@@ -118,62 +151,31 @@ impl<T: Clone + Eq + Hash> LRUEvictor<T> {
     pub fn is_empty(&self) -> bool {
         self.free_table.is_empty()
     }
-
-    /// Check if cleanup is necessary and perform it if needed
-    fn cleanup_if_necessary(&mut self) {
-        if self.priority_queue.len() > self.cleanup_threshold * self.free_table.len() {
-            self.cleanup();
-        }
-    }
-
-    /// Clean up the priority queue by removing outdated entries
-    fn cleanup(&mut self) {
-        let mut new_priority_queue = VecDeque::new();
-        for (object, timestamp) in self.priority_queue.drain(..) {
-            let Some(&current_timestamp) = self.free_table.get(&object) else {
-                continue;
-            };
-
-            if current_timestamp == timestamp {
-                new_priority_queue.push_back((object, timestamp));
-            }
-        }
-        self.priority_queue = new_priority_queue;
-    }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
-    use rstest::rstest;
 
-    #[rstest]
-    #[case(1)]
-    #[case(2)]
-    #[case(3)]
-    fn test_lru_evictor_eviction_order(#[case] threshold: usize) {
-        // Create a new LRUEvictor with the given cleanup threshold
-        let mut evictor = LRUEvictor::<i32>::new(threshold);
+    #[test]
+    fn test_lru_evictor_eviction_order() {
+        // Create a new LRUEvictor
+        let mut evictor = LRUEvictor::<i32>::new(1); // threshold value doesn't matter anymore
 
-        // Add items in the specified order with small delays between each
+        // Add items in the specified order
         evictor.insert(4);
-        std::thread::sleep(std::time::Duration::from_millis(1));
         evictor.insert(3);
-        std::thread::sleep(std::time::Duration::from_millis(1));
         evictor.insert(2);
-        std::thread::sleep(std::time::Duration::from_millis(1));
         evictor.insert(1);
-        std::thread::sleep(std::time::Duration::from_millis(1));
         evictor.insert(5);
-        std::thread::sleep(std::time::Duration::from_millis(1));
-        evictor.insert(1); // Updates timestamp for 1
-        std::thread::sleep(std::time::Duration::from_millis(1));
-        evictor.insert(4); // Updates timestamp for 4
-        std::thread::sleep(std::time::Duration::from_millis(1));
-        evictor.insert(2); // Updates timestamp for 2
+        evictor.insert(1); // Updates counter for 1
+        evictor.insert(4); // Updates counter for 4
+        evictor.insert(2); // Updates counter for 2
+        evictor.push_front(4);
 
         // Verify the eviction order
-        println!("Testing with threshold {}", threshold);
+        let evicted = evictor.evict().unwrap();
+        assert_eq!(evicted, 4);
         let evicted = evictor.evict().unwrap();
         assert_eq!(evicted, 3);
         let evicted = evictor.evict().unwrap();
@@ -181,11 +183,11 @@ mod tests {
         let evicted = evictor.evict().unwrap();
         assert_eq!(evicted, 1);
         let evicted = evictor.evict().unwrap();
-        assert_eq!(evicted, 4);
-        let evicted = evictor.evict().unwrap();
         assert_eq!(evicted, 2);
         let evicted = evictor.evict();
         assert_eq!(evicted, None);
         assert_eq!(evictor.len(), 0);
     }
+
+    // TODO: add a dedicated test covering push_front eviction ordering
 }

From f2343d5090d5ead69f06e03b6fc2d30134b7c073 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 27 May 2025 14:18:05 -0700
Subject: [PATCH 02/36] preemption is push_front

---
 lib/llm/src/mocker/scheduler.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index e71647feab..d26f6ab5a2 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -174,7 +174,7 @@ impl SchedulerState {
 
         // Insert the new sequence back into the requests map and add to waiting queue
         self.requests.insert(uuid, Request::Active(active_sequence));
-        self.waiting.push_back(uuid);
+        self.waiting.push_front(uuid);
 
         Some(signals)
     }
@@ -546,7 +546,7 @@ mod tests {
                 // Manual debug ticker that prints forward pass metrics
                 _ = debug_interval.tick() => {
                     let _metrics = scheduler.get_forward_pass_metrics().await;
-                    // println!("Forward Pass Metrics: {:#?}", _metrics);
+                    println!("Forward Pass Metrics: {:#?}", _metrics);
                 }
 
                 Some(_) = output_rx.recv() => {

From 6fe3154115fcda4f49c0ab00a78b042540b1bb22 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 27 May 2025 14:58:54 -0700
Subject: [PATCH 03/36] use Hongkuan's quadratic formulas for decode and
 prefill

---
 lib/llm/src/mocker/kv_manager.rs |  7 ++++-
 lib/llm/src/mocker/scheduler.rs  | 50 +++++++++++++++++---------------
 lib/llm/src/mocker/sequence.rs   | 24 ++++-----------
 3 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs
index 8a7a8fefed..caa6fda110 100644
--- a/lib/llm/src/mocker/kv_manager.rs
+++ b/lib/llm/src/mocker/kv_manager.rs
@@ -200,6 +200,11 @@ impl KvManager {
         self.active_blocks.len()
     }
 
+    /// Get the fraction of active blocks relative to maximum capacity (a ratio, not multiplied by 100)
+    pub fn get_active_perc(&self) -> f64 {
+        self.active_blocks.len() as f64 / self.max_capacity as f64
+    }
+
     /// Get the number of inactive blocks
     pub fn num_inactive_blocks(&self) -> usize {
         self.inactive_blocks.len()
@@ -261,7 +266,7 @@ impl KvManager {
 
         // Calculate prefill compute
         let prefill_compute =
-            new_tokens as f64 * (new_tokens + overlap_blocks * self.block_size) as f64;
+            1.25e-6 * (new_tokens as f64).powi(2) + 7.41e-2 * (new_tokens as f64) + 2.62e1;
 
         Some(PrefillCost {
             new_tokens,
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index d26f6ab5a2..e01b677e7e 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -194,7 +194,7 @@ impl Scheduler {
         kv_capacity: usize,
         watermark: f64,
         block_size: usize,
-        chunk_size: Option<usize>,
+        speedup_ratio: Option<f64>,
         output_tx: Option<mpsc::Sender<Uuid>>,
         cancellation_token: Option<CancellationToken>,
     ) -> Self {
@@ -205,7 +205,16 @@ impl Scheduler {
         let state = Arc::new(Mutex::new(SchedulerState::default()));
 
         let kv_manager = Arc::new(Mutex::new(kv_manager));
-        let chunk_size = chunk_size.unwrap_or(256);
+
+        // Assert speedup_ratio is greater than 0 if provided
+        if let Some(ratio) = speedup_ratio {
+            assert!(
+                ratio > 0.0,
+                "speedup_ratio must be greater than 0, got: {}",
+                ratio
+            );
+        }
+        let speedup_ratio = speedup_ratio.unwrap_or(1.0);
 
         // Create channel for request handling
         let (request_tx, mut request_rx) = mpsc::channel::<DirectRequest>(1024);
@@ -242,7 +251,7 @@ impl Scheduler {
                         // Process DirectRequests, converting them to ActiveSequence and scheduling them until we can't
                         // schedule anymore.
                         while let Some((uuid, request)) = state_guard.next() {
-                            let active_sequence = get_active_sequence(request, block_size, chunk_size);
+                            let active_sequence = get_active_sequence(request, block_size);
 
                             // Calculate token budget using new_tokens from PrefillCost
                             let total_prefill_tokens = state_guard.num_batched_tokens();
@@ -271,10 +280,10 @@ impl Scheduler {
                         let mut state_guard = state_clone.lock().await;
                         let mut kv_manager_guard = kv_manager_clone.lock().await;
 
-                        // Base time needed for decoding (assumed memory bound on KV cache)
-                        let active_tokens = kv_manager_guard.num_active_blocks() * block_size;
-                        // TODO: 2 is a dummy / magic scaling factor
-                        let mut generation_time = Duration::from_micros((active_tokens / 2) as u64);
+                        // Base decode time in milliseconds, computed as a quadratic function of the active-block fraction
+                        let active_perc = kv_manager_guard.get_active_perc();
+                        let decoding_time = -5.47 * active_perc.powi(2) + 43.88 * active_perc + 19.44;
+                        let mut total_time = Duration::from_secs_f64(decoding_time / 1000.0);
 
                         // Process each running request
                         let uuids: Vec<Uuid> = state_guard.running.keys().cloned().collect();
@@ -285,7 +294,7 @@ impl Scheduler {
                             }
 
                             // Get prefill compute value first
-                            let prefill_compute = state_guard.get_prefill_compute(&uuid);
+                            let prefill_compute = state_guard.get_prefill_compute(&uuid).unwrap_or(0.);
 
                             // Get the active sequence for this UUID
                             let sequence = state_guard.requests.get_mut(&uuid)
@@ -295,14 +304,6 @@ impl Scheduler {
                             // Generate token and get signals
                             let signals = sequence.generate();
 
-                            // Accumulate sleep duration based on prefill_compute if available
-                            // prefill compute = (cached_tokens + new_tokens) * new_tokens
-                            let sleep_ms = if let Some(compute) = prefill_compute {
-                                // TODO: 1024 is a dummy / magic scaling factor
-                                (compute / 1024.0) as u64
-                            } else { 0 };
-                            generation_time += Duration::from_micros(sleep_ms);
-
                             // Process all signals with the KvManager
                             // Handling of preemption on failure
                             if !process_signals(&mut kv_manager_guard, &signals) {
@@ -319,8 +320,10 @@ impl Scheduler {
                                 continue;
                             }
 
+                            // Add this request's prefill compute (treated as milliseconds; 0 when no cost is recorded) to the sleep duration
+                            total_time += Duration::from_secs_f64(prefill_compute / 1000.0);
+
                             // Send UUID notification for each generated token
-                            // TODO: hook this up to an AsyncEngine
                             if let Some(tx) = &output_tx_clone {
                                 let _ = tx.try_send(uuid);
                             }
@@ -337,9 +340,10 @@ impl Scheduler {
                             }
                         }
 
-                        // Sleep once for the accumulated duration
-                        if generation_time.as_millis() > 0 {
-                            tokio::time::sleep(generation_time).await;
+                        // Sleep once for the adjusted duration
+                        let adjusted_time = Duration::from_secs_f64(total_time.as_secs_f64() / speedup_ratio);
+                        if adjusted_time.as_millis() > 0 {
+                            tokio::time::sleep(adjusted_time).await;
                         }
                     }
                 }
@@ -405,7 +409,7 @@ impl Scheduler {
 }
 
 /// Convert a Request to an ActiveSequence
-fn get_active_sequence(request: Request, block_size: usize, chunk_size: usize) -> ActiveSequence {
+fn get_active_sequence(request: Request, block_size: usize) -> ActiveSequence {
     if let Request::Active(active_seq) = request {
         return active_seq;
     }
@@ -418,7 +422,6 @@ fn get_active_sequence(request: Request, block_size: usize, chunk_size: usize) -
         direct_request.tokens,
         direct_request.max_output_tokens,
         Some(block_size),
-        Some(chunk_size),
     )
 }
 
@@ -475,7 +478,6 @@ mod tests {
         let kv_capacity: usize = 500;
         let watermark: f64 = 0.01; // 1% watermark
         let block_size: usize = 64;
-        let chunk_size: usize = 256;
         let num_requests: usize = 100;
         let input_len: usize = 1000;
         let max_output_tokens: usize = 100;
@@ -488,7 +490,7 @@ mod tests {
             kv_capacity,
             watermark,
             block_size,
-            Some(chunk_size),
+            Some(10.0), // speedup_ratio
             Some(output_tx),
             None,
         );
diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs
index d53dd870e1..b79e79872a 100644
--- a/lib/llm/src/mocker/sequence.rs
+++ b/lib/llm/src/mocker/sequence.rs
@@ -52,9 +52,6 @@ pub struct ActiveSequence {
     #[getter(copy)]
     block_size: usize,
 
-    #[getter(copy)]
-    chunk_size: usize, // TODO: not actually used
-
     #[getter(copy)]
     max_output_tokens: usize,
 
@@ -69,15 +66,9 @@ pub struct ActiveSequence {
 
 impl ActiveSequence {
     /// Create a new ActiveSequence instance with the provided tokens
-    pub fn new(
-        tokens: Vec<u32>,
-        max_output_tokens: usize,
-        block_size: Option<usize>,
-        chunk_size: Option<usize>,
-    ) -> Self {
+    pub fn new(tokens: Vec<u32>, max_output_tokens: usize, block_size: Option<usize>) -> Self {
         let block_size = block_size.unwrap_or(64);
         assert!(block_size > 1, "block_size must be greater than 1");
-        let chunk_size = chunk_size.unwrap_or(256);
         let num_input_tokens = tokens.len();
 
         let tokens = Tokens::from(tokens).into_sequence(block_size, None);
@@ -88,7 +79,6 @@ impl ActiveSequence {
             unique_blocks,
             tokens,
             block_size,
-            chunk_size,
             max_output_tokens,
             generated_tokens: 0,
             num_input_tokens,
@@ -113,9 +103,8 @@ impl ActiveSequence {
         tokens: Vec<u32>,
         max_output_tokens: usize,
         block_size: Option<usize>,
-        chunk_size: Option<usize>,
     ) -> (Self, Option<MoveBlock>) {
-        let mut sequence = Self::new(tokens, max_output_tokens, block_size, chunk_size);
+        let mut sequence = Self::new(tokens, max_output_tokens, block_size);
         let signal = sequence.creation_signal.take();
         (sequence, signal)
     }
@@ -237,8 +226,7 @@ mod tests {
     fn test_active_sequence_push() {
         // Create a sequence with block size 16 initialized with tokens [0..15]
         let initial_tokens: Vec<u32> = (0..15).collect();
-        let (mut seq1, signal1) =
-            ActiveSequence::new_with_signal(initial_tokens, 100, Some(16), Some(256));
+        let (mut seq1, signal1) = ActiveSequence::new_with_signal(initial_tokens, 100, Some(16));
         assert_eq!(seq1.num_input_tokens(), 15);
         assert_eq!(seq1.len(), 15);
 
@@ -289,8 +277,7 @@ mod tests {
 
         // Create another sequence with block size 16 initialized with tokens [0..17]
         let extended_tokens: Vec<u32> = (0..16).collect();
-        let (mut seq2, _) =
-            ActiveSequence::new_with_signal(extended_tokens, 100, Some(16), Some(256));
+        let (mut seq2, _) = ActiveSequence::new_with_signal(extended_tokens, 100, Some(16));
         seq2.push(16);
         seq2.pop();
         seq2.push(16);
@@ -363,8 +350,7 @@ mod tests {
     fn test_active_sequence_generate_signals() {
         // Create a sequence with block size 16, max_output_tokens 4, initialized with tokens [0..14)
         let initial_tokens: Vec<u32> = (0..14).collect();
-        let (mut seq, signal) =
-            ActiveSequence::new_with_signal(initial_tokens, 5, Some(16), Some(256));
+        let (mut seq, signal) = ActiveSequence::new_with_signal(initial_tokens, 5, Some(16));
 
         // Initial signal - should have received a Use signal for the partial block
         assert!(signal.is_some());

From cccebad010578507075b571f7e142f960109e2e8 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Wed, 28 May 2025 00:29:58 -0700
Subject: [PATCH 04/36] cleaner scheduling + generation separation, and
 waterline bug fix

---
 lib/llm/src/mocker/kv_manager.rs |  52 ++-----
 lib/llm/src/mocker/protocols.rs  |   1 +
 lib/llm/src/mocker/scheduler.rs  | 227 +++++++++++++++----------------
 3 files changed, 121 insertions(+), 159 deletions(-)

diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs
index caa6fda110..71522dcb58 100644
--- a/lib/llm/src/mocker/kv_manager.rs
+++ b/lib/llm/src/mocker/kv_manager.rs
@@ -178,7 +178,7 @@ impl KvManager {
     pub fn probe_new_blocks(&self, blocks: &[UniqueBlock]) -> usize {
         blocks
             .iter()
-            .filter(|&block| !self.all_blocks.contains(block))
+            .filter(|&block| !self.active_blocks.contains_key(block))
             .count()
     }
 
@@ -221,57 +221,21 @@ impl KvManager {
     }
 
     /// Check if a sequence can be scheduled and calculate cost if possible
-    pub fn try_schedule(
-        &self,
-        sequence: &ActiveSequence,
-        watermark: f64,
-        tokens_budget: usize,
-    ) -> Option<PrefillCost> {
-        // Return None immediately if tokens_budget is 0
-        if tokens_budget == 0 {
-            return None;
-        }
-
-        // Get unique blocks from the sequence
-        let unique_blocks = sequence.unique_blocks();
-
-        // Get the count of new blocks
-        let new_blocks = self.probe_new_blocks(unique_blocks);
-
-        // Calculate current usage and available capacity
-        let active_count = self.active_blocks.len();
-
-        // Check if we can schedule based on the watermark
-        if (active_count + new_blocks) as f64 > (1.0 - watermark) * self.max_capacity as f64 {
-            return None;
-        }
-
-        // Calculate overlap blocks
-        let overlap_blocks = unique_blocks.len() - new_blocks;
-
-        // Calculate new tokens
+    pub fn get_prefill_cost(&self, sequence: &ActiveSequence) -> PrefillCost {
+        let seq_blocks = sequence.unique_blocks();
+        let new_blocks = self.probe_new_blocks(seq_blocks);
+        let overlap_blocks = seq_blocks.len() - new_blocks;
         let new_tokens = sequence.num_input_tokens() - overlap_blocks * self.block_size;
 
-        // // Print the full equation with actual values substituted
-        // println!("{} = {} - ({} * {}) (new_tokens = num_input_tokens - overlap_blocks * block_size)",
-        //     new_tokens,
-        //     sequence.num_input_tokens(),
-        //     overlap_blocks,
-        //     self.block_size);
-
-        // Return None if new_tokens exceeds tokens_budget
-        if new_tokens > tokens_budget {
-            return None;
-        }
-
         // Calculate prefill compute
         let prefill_compute =
             1.25e-6 * (new_tokens as f64).powi(2) + 7.41e-2 * (new_tokens as f64) + 2.62e1;
 
-        Some(PrefillCost {
+        PrefillCost {
+            new_blocks,
             new_tokens,
             prefill_compute,
-        })
+        }
     }
 }
 
diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs
index 2b551db61b..51440ee9d4 100644
--- a/lib/llm/src/mocker/protocols.rs
+++ b/lib/llm/src/mocker/protocols.rs
@@ -57,6 +57,7 @@ pub struct DirectRequest {
 /// Represents the cost of prefilling content in the cache
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PrefillCost {
+    pub new_blocks: usize,
     pub new_tokens: usize,
     pub prefill_compute: f64,
 }
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index e01b677e7e..5f6776a9b1 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -63,8 +63,8 @@ pub enum Request {
 #[derive(Default)]
 struct SchedulerState {
     waiting: VecDeque<Uuid>,
-    ready: VecDeque<Uuid>,
-    running: LRUEvictor<Uuid>,
+    prefill: VecDeque<Uuid>,
+    decode: LRUEvictor<Uuid>,
     requests: HashMap<Uuid, Request>,
     prefill_costs: HashMap<Uuid, Option<PrefillCost>>,
 }
@@ -74,61 +74,66 @@ impl SchedulerState {
     fn receive(&mut self, request: DirectRequest) -> Uuid {
         // Use the provided UUID if available, otherwise generate a new one
         let uuid = request.uuid.unwrap_or_else(Uuid::new_v4);
-
-        // Add the request to the map and waiting queue
         self.requests.insert(uuid, Request::Direct(request));
         self.waiting.push_back(uuid);
         uuid
     }
 
     /// Get the next UUID from ready or waiting queue and its associated Request.
-    /// Returns from ready if not empty, otherwise from waiting, or None if both are empty.
-    /// Also removes the Request from the requests HashMap.
     fn next(&mut self) -> Option<(Uuid, Request)> {
-        let uuid = self
-            .ready
-            .pop_front()
-            .or_else(|| self.waiting.pop_front())?;
-        let request = self.requests.remove(&uuid)?;
+        let uuid = self.waiting.pop_front()?;
+        let request = self
+            .requests
+            .remove(&uuid)
+            .expect("Request does not exist.");
         Some((uuid, request))
     }
 
+    /// Move a UUID and its Request to the waiting queue (front).
+    fn first_in_line(&mut self, uuid: Uuid, request: Request) {
+        self.requests.insert(uuid, request);
+        self.waiting.push_front(uuid);
+    }
+
     /// Move a UUID and its Request to the ready queue.
-    fn make_ready(&mut self, uuid: Uuid, active_seq: ActiveSequence) {
+    fn start_prefill(&mut self, uuid: Uuid, active_seq: ActiveSequence, cost: Option<PrefillCost>) {
         self.requests.insert(uuid, Request::Active(active_seq));
-        self.ready.push_back(uuid);
+        self.prefill.push_back(uuid);
+        self.prefill_costs.insert(uuid, cost);
     }
 
-    /// Schedule the request with the given UUID.
-    /// Returns the creation signal from the ActiveSequence.
-    fn run(&mut self, uuid: Uuid, active_seq: ActiveSequence) -> MoveBlock {
-        // Insert the request into the map
-        self.requests.insert(uuid, Request::Active(active_seq));
+    /// Pop from prefill queue and move to decode queue.
+    /// Returns the prefill_compute value and the sequence's creation signal, or None if the prefill queue is empty.
+    fn start_decode(&mut self) -> Option<(f64, MoveBlock)> {
+        let uuid = self.prefill.pop_front()?;
+        self.decode.insert(uuid);
+
+        // Remove and extract prefill_compute from prefill_costs
+        let prefill_cost = self
+            .prefill_costs
+            .remove(&uuid)
+            .flatten()
+            .expect("Expects valid prefill cost.");
 
-        // Get the creation signal
         let Some(Request::Active(sequence)) = self.requests.get(&uuid) else {
-            panic!("Failed to get ActiveSequence for UUID");
-        };
-        let Some(signal) = sequence.creation_signal() else {
-            panic!("Failed to get creation signal from ActiveSequence");
+            panic!("Request does not exist.");
         };
+        let creation_signal = sequence
+            .creation_signal()
+            .clone()
+            .expect("Must have creation signal.");
 
-        // Add to running requests
-        self.running.insert(uuid);
-        signal.clone()
-    }
-
-    /// Set the prefill cost for a UUID
-    fn set_prefill_cost(&mut self, uuid: Uuid, cost: Option<PrefillCost>) {
-        self.prefill_costs.insert(uuid, cost);
+        Some((prefill_cost.prefill_compute, creation_signal))
     }
 
-    /// Get the prefill compute value for a UUID if available
-    fn get_prefill_compute(&self, uuid: &Uuid) -> Option<f64> {
-        self.prefill_costs
-            .get(uuid)
-            .and_then(|cost| cost.as_ref())
-            .map(|cost| cost.prefill_compute)
+    fn run(&mut self, uuid: Uuid) -> Option<&mut ActiveSequence> {
+        if !self.decode.contains(&uuid) {
+            return None;
+        }
+        let Some(Request::Active(sequence)) = self.requests.get_mut(&uuid) else {
+            panic!("Request does not exist.");
+        };
+        Some(sequence)
     }
 
     /// Calculate the current running batched tokens
@@ -145,7 +150,7 @@ impl SchedulerState {
     /// Remove a UUID and its associated Request from collections.
     fn complete(&mut self, uuid: &Uuid) {
         // println!("Request {} will complete", uuid);
-        self.running.remove(uuid);
+        self.decode.remove(uuid);
         self.requests.remove(uuid);
         self.prefill_costs.remove(uuid);
     }
@@ -153,30 +158,29 @@ impl SchedulerState {
     /// Preempt the oldest running request by evicting it from running, resetting the sequence,
     /// and adding it back to the waiting queue.
     /// Returns the signal from reset_with_signal or None if no requests are running.
-    fn preempt(&mut self) -> Option<Vec<MoveBlock>> {
+    fn preempt(&mut self) -> Vec<MoveBlock> {
         // Evict the oldest UUID from running
-        let uuid = self.running.evict()?;
-        eprintln!("Request {} will be preempted", uuid);
-
-        // Remove the request from the requests HashMap and ensure it's an ActiveSequence
-        let request = self.requests.remove(&uuid)?;
-
-        // Remove the prefill cost to force recomputation
+        let uuid = self
+            .decode
+            .evict()
+            .expect("Nothing to evict for preemption.");
+        let request = self
+            .requests
+            .remove(&uuid)
+            .expect("Request does not exist.");
         self.prefill_costs.remove(&uuid);
+        eprintln!("Request {} will be preempted", uuid);
 
         // Extract the ActiveSequence from the Request enum
+        // Reset the sequence and get the new sequence and signal
+        // Insert the new sequence back into the requests map and add to waiting queue
         let Request::Active(mut active_sequence) = request else {
             panic!("Expected ActiveSequence in running queue")
         };
-
-        // Reset the sequence and get the new sequence and signal
         let signals = active_sequence.reset_with_signal();
+        self.first_in_line(uuid, Request::Active(active_sequence));
 
-        // Insert the new sequence back into the requests map and add to waiting queue
-        self.requests.insert(uuid, Request::Active(active_sequence));
-        self.waiting.push_front(uuid);
-
-        Some(signals)
+        signals
     }
 }
 
@@ -191,20 +195,19 @@ pub struct Scheduler {
 impl Scheduler {
     /// Create a new Scheduler with the given parameters
     pub fn new(
-        kv_capacity: usize,
-        watermark: f64,
+        num_gpu_blocks: usize,
         block_size: usize,
+        max_num_batched_tokens: Option<usize>,
+        watermark: Option<f64>,
         speedup_ratio: Option<f64>,
         output_tx: Option<mpsc::Sender<Uuid>>,
         cancellation_token: Option<CancellationToken>,
     ) -> Self {
-        // Create KvManager internally
-        let kv_manager = KvManager::new(kv_capacity, block_size);
+        let max_num_batched_tokens = max_num_batched_tokens.unwrap_or(8192);
+        let watermark = watermark.unwrap_or(0.01);
 
-        let token_capacity: usize = 8192;
         let state = Arc::new(Mutex::new(SchedulerState::default()));
-
-        let kv_manager = Arc::new(Mutex::new(kv_manager));
+        let kv_manager = Arc::new(Mutex::new(KvManager::new(num_gpu_blocks, block_size)));
 
         // Assert speedup_ratio is greater than 0 if provided
         if let Some(ratio) = speedup_ratio {
@@ -219,19 +222,17 @@ impl Scheduler {
         // Create channel for request handling
         let (request_tx, mut request_rx) = mpsc::channel::<DirectRequest>(1024);
 
-        // Use provided cancellation token or create new one
-        let cancellation_token = cancellation_token.unwrap_or_default();
-        let token_clone = cancellation_token.clone();
-
         // Create a clone for the background task
         let state_clone = state.clone();
         let kv_manager_clone = kv_manager.clone();
         let output_tx_clone = output_tx.clone();
+        let cancel_token_clone = cancellation_token.unwrap_or_default().clone();
 
         // Spawn main background task with cancellation token
         tokio::spawn(async move {
-            let mut schedule_interval = interval(Duration::from_millis(5));
-            let mut simulate_interval = interval(Duration::from_millis(1));
+            let mut schedule_interval = interval(Duration::from_secs_f64(1e-3));
+            let mut simulate_interval = interval(Duration::from_secs_f64(1e-4));
+            let mut should_schedule = true;
 
             loop {
                 tokio::select! {
@@ -243,35 +244,45 @@ impl Scheduler {
                         state.receive(request);
                     }
 
-                    // Try Scheduling Requests
+                    // Try scheduling requests on each interval tick, gated by `should_schedule`
                     _ = schedule_interval.tick() => {
+                        // Skip until a simulation step with decoding requests runs after an admission
+                        if !should_schedule {
+                            continue;
+                        }
+
                         let mut state_guard = state_clone.lock().await;
-                        let mut kv_manager_guard = kv_manager_clone.lock().await;
+                        let kv_manager_guard = kv_manager_clone.lock().await;
 
                         // Process DirectRequests, converting them to ActiveSequence and scheduling them until we can't
                         // schedule anymore.
+                        let mut current_blocks = kv_manager_guard.num_active_blocks();
+                        let mut current_tokens = state_guard.num_batched_tokens();
                         while let Some((uuid, request)) = state_guard.next() {
                             let active_sequence = get_active_sequence(request, block_size);
 
-                            // Calculate token budget using new_tokens from PrefillCost
-                            let total_prefill_tokens = state_guard.num_batched_tokens();
-                            let tokens_budget = token_capacity.saturating_sub(total_prefill_tokens);
+                            // Update predictive budgets
+                            let prefill_cost = kv_manager_guard.get_prefill_cost(&active_sequence);
+                            let new_blocks = prefill_cost.new_blocks;
+                            let new_tokens = prefill_cost.new_tokens;
+                            current_blocks += new_blocks;
+                            current_tokens += new_tokens;
 
                             // Check if it can be scheduled
-                            let Some(prefill_cost) = kv_manager_guard.try_schedule(&active_sequence, watermark, tokens_budget) else {
-                                state_guard.make_ready(uuid, active_sequence);
+                            let under_block_budget = current_blocks as f64 <= (1. - watermark) * kv_manager_guard.max_capacity() as f64;
+                            let under_token_budget = current_tokens <= max_num_batched_tokens;
+                            if under_block_budget && under_token_budget {
+                                state_guard.start_prefill(uuid, active_sequence, Some(prefill_cost));
+                                should_schedule = false;
+                            } else {
+                                state_guard.first_in_line(uuid, Request::Active(active_sequence));
                                 break;
-                            };
-
-                            // Get creation signal and schedule the request
-                            let signal = state_guard.run(uuid, active_sequence);
-                            kv_manager_guard.process(&signal);
-                            state_guard.set_prefill_cost(uuid, Some(prefill_cost));
+                            }
                         }
                     }
 
                     // Check for cancellation
-                    _ = token_clone.cancelled() => {
+                    _ = cancel_token_clone.cancelled() => {
                         break;
                     }
 
@@ -285,44 +296,36 @@ impl Scheduler {
                         let decoding_time = -5.47 * active_perc.powi(2) + 43.88 * active_perc + 19.44;
                         let mut total_time = Duration::from_secs_f64(decoding_time / 1000.0);
 
-                        // Process each running request
-                        let uuids: Vec<Uuid> = state_guard.running.keys().cloned().collect();
-                        for uuid in uuids {
-                            // Check if UUID is still in running_requests, if not skip this iteration
-                            if !state_guard.running.contains(&uuid) {
-                                continue;
+                        // Process prefilling
+                        while let Some((prefill_compute, creation_signal)) = state_guard.start_decode() {
+                            // NOTE: Prefill cost/time is always incremented for new blocks, even if they
+                            // could be cached by other requests in the same batch. This matches vLLM behavior.
+                            total_time += Duration::from_secs_f64(prefill_compute / 1000.0);
+                            let prefill_success = process_signals(&mut kv_manager_guard, std::slice::from_ref(&creation_signal));
+                            if !prefill_success {
+                                panic!("Block allocation for prefilling cannot fail.");
                             }
+                        }
 
-                            // Get prefill compute value first
-                            let prefill_compute = state_guard.get_prefill_compute(&uuid).unwrap_or(0.);
-
-                            // Get the active sequence for this UUID
-                            let sequence = state_guard.requests.get_mut(&uuid)
-                                .and_then(|req| if let Request::Active(seq) = req { Some(seq) } else { None })
-                                .expect("UUID in running_requests must have a corresponding active sequence");
-
-                            // Generate token and get signals
+                        // Process decoding
+                        let uuids: Vec<Uuid> = state_guard.decode.keys().cloned().collect();
+                        if !uuids.is_empty() {should_schedule = true};
+                        for uuid in uuids {
+                            let Some(sequence) = state_guard.run(uuid) else {
+                                continue;
+                            };
                             let signals = sequence.generate();
 
                             // Process all signals with the KvManager
                             // Handling of preemption on failure
                             if !process_signals(&mut kv_manager_guard, &signals) {
                                 sequence.pop();  // revert the failed generation op
-
-                                // free_signal derefs the preempted blocks
-                                let Some(free_signal) = state_guard.preempt() else {
-                                    panic!("Failed to acquire signal to free KV blocks from preemption");
-                                };
-
-                                for signal in free_signal {
+                                for signal in state_guard.preempt() {
                                     kv_manager_guard.process(&signal);
                                 }
                                 continue;
                             }
 
-                            // Accumulate sleep duration based on prefill_compute if available
-                            total_time += Duration::from_secs_f64(prefill_compute / 1000.0);
-
                             // Send UUID notification for each generated token
                             if let Some(tx) = &output_tx_clone {
                                 let _ = tx.try_send(uuid);
@@ -333,11 +336,6 @@ impl Scheduler {
                                 state_guard.complete(&uuid);
                                 continue;
                             }
-
-                            // Transition to decode (no prefill cost)
-                            if sequence.generated_tokens() == 1 {
-                                state_guard.set_prefill_cost(uuid, None);
-                            }
                         }
 
                         // Sleep once for the adjusted duration
@@ -371,7 +369,7 @@ impl Scheduler {
     /// Get the count of running requests
     pub async fn running_count(&self) -> usize {
         let state = self.state.lock().await;
-        state.running.len()
+        state.decode.len()
     }
 
     /// Get the current capacity of the KvManager
@@ -397,7 +395,7 @@ impl Scheduler {
         };
 
         ForwardPassMetrics {
-            request_active_slots: state.running.len() as u64,
+            request_active_slots: state.decode.len() as u64,
             request_total_slots: 420, // Dummy value as specified
             kv_active_blocks: active_blocks_count,
             kv_total_blocks: total_capacity,
@@ -476,7 +474,6 @@ mod tests {
         std::env::set_var("RUST_LOG", "debug");
 
         let kv_capacity: usize = 500;
-        let watermark: f64 = 0.01; // 1% watermark
         let block_size: usize = 64;
         let num_requests: usize = 100;
         let input_len: usize = 1000;
@@ -488,8 +485,9 @@ mod tests {
         // Create scheduler with internal KvManager
         let scheduler = Scheduler::new(
             kv_capacity,
-            watermark,
             block_size,
+            None,
+            None,
             Some(10.0), // speedup_ratio
             Some(output_tx),
             None,
@@ -548,7 +546,6 @@ mod tests {
                 // Manual debug ticker that prints forward pass metrics
                 _ = debug_interval.tick() => {
                     let _metrics = scheduler.get_forward_pass_metrics().await;
-                    println!("Forward Pass Metrics: {:#?}", _metrics);
                 }
 
                 Some(_) = output_rx.recv() => {

From 394c2bf471c084a4322f902266c7b5d4a7e5e6bd Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Thu, 29 May 2025 17:39:41 -0700
Subject: [PATCH 05/36] restore printing out fwd pass metrics in test

---
 lib/llm/src/mocker/scheduler.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 5f6776a9b1..ddf83e4d11 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -546,6 +546,7 @@ mod tests {
                 // Manual debug ticker that prints forward pass metrics
                 _ = debug_interval.tick() => {
                     let _metrics = scheduler.get_forward_pass_metrics().await;
+                    println!("Forward Pass Metrics: {:#?}", _metrics);
                 }
 
                 Some(_) = output_rx.recv() => {

From dad183f81503c973b31c96dee3c75435b2e82747 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Wed, 11 Jun 2025 00:42:03 -0700
Subject: [PATCH 06/36] multi-dp mocker engine

---
 lib/llm/src/mocker.rs           |   1 +
 lib/llm/src/mocker/engine.rs    | 251 ++++++++++++++++++++++++++++++++
 lib/llm/src/mocker/protocols.rs |  38 +++++
 lib/llm/src/mocker/scheduler.rs |  83 ++++++-----
 4 files changed, 334 insertions(+), 39 deletions(-)
 create mode 100644 lib/llm/src/mocker/engine.rs

diff --git a/lib/llm/src/mocker.rs b/lib/llm/src/mocker.rs
index 2a9e63a9e2..4315868c49 100644
--- a/lib/llm/src/mocker.rs
+++ b/lib/llm/src/mocker.rs
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+pub mod engine;
 pub mod evictor;
 pub mod kv_manager;
 pub mod protocols;
diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
new file mode 100644
index 0000000000..ec70cd4ea2
--- /dev/null
+++ b/lib/llm/src/mocker/engine.rs
@@ -0,0 +1,251 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! MockVllmEngine - AsyncEngine wrapper around the Scheduler
+//!
+//! This module provides an AsyncEngine implementation that wraps the Scheduler
+//! to provide streaming token generation with realistic timing simulation.
+
+use crate::mocker::protocols::{DirectRequest, MockEngineArgs, OutputSignal};
+use crate::mocker::scheduler::Scheduler;
+
+use dynamo_runtime::{
+    engine::AsyncEngineContextProvider,
+    pipeline::{async_trait, AsyncEngine, Error, ManyOut, ResponseStream, SingleIn},
+    protocols::annotated::Annotated,
+};
+
+use rand::Rng;
+use std::collections::HashMap;
+use std::sync::Arc;
+use tokio::sync::{mpsc, Mutex};
+use tokio_stream::wrappers::ReceiverStream;
+use uuid::Uuid;
+
+/// Generate a random printable character
+fn generate_random_char() -> String {
+    let mut rng = rand::rng();
+    let selection = match rng.random_range(0..4) {
+        0 => ('a'..='z').nth(rng.random_range(0..26)).unwrap(), // lowercase
+        1 => ('A'..='Z').nth(rng.random_range(0..26)).unwrap(), // uppercase
+        2 => ('0'..='9').nth(rng.random_range(0..10)).unwrap(), // digits
+        _ => [' ', '.', ',', '!', '?'][rng.random_range(0..5)], // punctuation/space
+    };
+    selection.to_string()
+}
+
+/// AsyncEngine wrapper around one Scheduler per `dp_rank` that generates random character tokens
+pub struct MockVllmEngine {
+    schedulers: Vec<Scheduler>,
+    active_requests: Arc<Mutex<HashMap<Uuid, mpsc::Sender<OutputSignal>>>>,
+    dp_size: u32,
+}
+
+impl MockVllmEngine {
+    /// Create a new MockVllmEngine with the given parameters
+    pub fn new(args: MockEngineArgs) -> Self {
+        let mut schedulers = Vec::new();
+        let active_requests = Arc::new(Mutex::new(
+            HashMap::<Uuid, mpsc::Sender<OutputSignal>>::new(),
+        ));
+
+        // Create multiple schedulers and their background tasks
+        for _ in 0..args.dp_size {
+            // Create a shared output channel that this scheduler will use
+            let (output_tx, output_rx) = mpsc::channel::<OutputSignal>(1024);
+
+            let scheduler = Scheduler::new(
+                args.clone(),
+                Some(output_tx),
+                None, // No global cancellation token
+            );
+
+            schedulers.push(scheduler);
+
+            // Spawn a background task for this scheduler to distribute token notifications to active requests
+            let output_rx = Arc::new(Mutex::new(output_rx));
+            let active_requests_clone = active_requests.clone();
+
+            tokio::spawn(async move {
+                loop {
+                    let signal = {
+                        let mut rx = output_rx.lock().await;
+                        match rx.recv().await {
+                            Some(signal) => signal,
+                            None => break, // Channel closed
+                        }
+                    };
+
+                    // Notify the specific request that a token was generated
+                    let active = active_requests_clone.lock().await;
+                    if let Some(request_tx) = active.get(&signal.uuid) {
+                        let _ = request_tx.send(signal).await;
+                    }
+                }
+            });
+        }
+
+        Self {
+            schedulers,
+            active_requests,
+            dp_size: args.dp_size,
+        }
+    }
+}
+
+#[async_trait]
+impl AsyncEngine<SingleIn<DirectRequest>, ManyOut<Annotated<String>>, Error> for MockVllmEngine {
+    async fn generate(
+        &self,
+        input: SingleIn<DirectRequest>,
+    ) -> Result<ManyOut<Annotated<String>>, Error> {
+        let (mut request, ctx) = input.into_parts();
+
+        let dp_rank = request.dp_rank.unwrap_or(0);
+
+        // Validate dp_rank
+        if dp_rank >= self.dp_size {
+            return Err(Error::msg(format!(
+                "dp_rank {} is out of bounds for dp_size {}",
+                dp_rank, self.dp_size
+            )));
+        }
+
+        let request_uuid = ctx.id().parse().unwrap_or(Uuid::new_v4());
+        request.uuid = Some(request_uuid);
+
+        let (request_tx, mut request_rx) = mpsc::channel::<OutputSignal>(64);
+        {
+            let mut active = self.active_requests.lock().await;
+            active.insert(request_uuid, request_tx);
+        }
+
+        // Send the request to the appropriate scheduler based on dp_rank
+        self.schedulers[dp_rank as usize]
+            .receive(request.clone())
+            .await;
+
+        // Create a simple channel for the stream
+        let (stream_tx, stream_rx) = mpsc::channel::<Annotated<String>>(64);
+
+        let active_requests = self.active_requests.clone();
+        let async_context = ctx.context();
+
+        // Spawn a task that forwards one random character per signal until completion or stop
+        tokio::spawn(async move {
+            loop {
+                tokio::select! {
+                    Some(signal) = request_rx.recv() => {
+                        if signal.completed {
+                            break;
+                        }
+                        let output = generate_random_char();
+                        if stream_tx.send(Annotated::from_data(output)).await.is_err() {
+                            break;
+                        }
+                    }
+
+                    _ = async_context.stopped() => {
+                        break;
+                    }
+                }
+            }
+
+            // Clean up: remove this request from active requests
+            let mut active = active_requests.lock().await;
+            active.remove(&request_uuid);
+        });
+
+        // Create a simple ReceiverStream which is naturally Send + Sync
+        let stream = ReceiverStream::new(stream_rx);
+        Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use dynamo_runtime::pipeline::Context;
+    use futures::StreamExt;
+
+    #[tokio::test]
+    async fn test_multiple_workers_with_token_limit() {
+        const DP_SIZE: u32 = 2;
+        const TOKENS_PER_REQUEST: usize = 20;
+
+        // Create the MockVllmEngine using builder pattern
+        let args = MockEngineArgs::builder()
+            .speedup_ratio(10.0)
+            .dp_size(DP_SIZE)
+            .build()
+            .unwrap();
+
+        let engine = MockVllmEngine::new(args);
+
+        // Create 4 DirectRequests: 2 for worker 0, 2 for worker 1
+        let requests = vec![
+            DirectRequest {
+                tokens: vec![1, 2, 3, 4],
+                max_output_tokens: TOKENS_PER_REQUEST,
+                uuid: None,
+                dp_rank: Some(0),
+            },
+            DirectRequest {
+                tokens: vec![5, 6, 7, 8],
+                max_output_tokens: TOKENS_PER_REQUEST,
+                uuid: None,
+                dp_rank: Some(0),
+            },
+            DirectRequest {
+                tokens: vec![9, 10, 11, 12],
+                max_output_tokens: TOKENS_PER_REQUEST,
+                uuid: None,
+                dp_rank: Some(1),
+            },
+            DirectRequest {
+                tokens: vec![13, 14, 15, 16],
+                max_output_tokens: TOKENS_PER_REQUEST,
+                uuid: None,
+                dp_rank: Some(1),
+            },
+        ];
+
+        // Generate streams and collect all tokens from each
+        for request in requests {
+            let ctx = Context::new(request);
+            let stream = engine.generate(ctx).await.unwrap();
+
+            let tokens: Vec<_> = stream.collect().await;
+
+            // Verify each stream produces exactly the expected number of tokens
+            assert_eq!(tokens.len(), TOKENS_PER_REQUEST);
+
+            // Verify all tokens contain valid data
+            for token in tokens {
+                assert!(token.data.is_some());
+            }
+        }
+
+        // Give a small delay to ensure cleanup tasks complete
+        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+
+        // Verify that active_requests is empty (all requests cleaned up)
+        let active_requests = engine.active_requests.lock().await;
+        assert!(
+            active_requests.is_empty(),
+            "Active requests should be empty after streams complete"
+        );
+    }
+}
diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs
index 51440ee9d4..fc66ce061b 100644
--- a/lib/llm/src/mocker/protocols.rs
+++ b/lib/llm/src/mocker/protocols.rs
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use derive_builder::Builder;
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
 
@@ -52,6 +53,7 @@ pub struct DirectRequest {
     pub tokens: Vec<Token>,
     pub max_output_tokens: usize,
     pub uuid: Option<Uuid>,
+    pub dp_rank: Option<u32>,
 }
 
 /// Represents the cost of prefilling content in the cache
@@ -62,6 +64,42 @@ pub struct PrefillCost {
     pub prefill_compute: f64,
 }
 
+/// Signal for output token generation with completion status
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct OutputSignal {
+    pub uuid: Uuid,
+    pub completed: bool,
+}
+
+/// Configuration arguments for MockVllmEngine
+#[derive(Debug, Clone, Serialize, Deserialize, Builder)]
+#[builder(pattern = "owned", build_fn(public))]
+pub struct MockEngineArgs {
+    #[builder(default = "16384")]
+    pub num_gpu_blocks: usize,
+
+    #[builder(default = "64")]
+    pub block_size: usize,
+
+    #[builder(default)]
+    pub max_num_batched_tokens: Option<usize>,
+
+    #[builder(default = "0.01")]
+    pub watermark: f64,
+
+    #[builder(default = "1.0")]
+    pub speedup_ratio: f64,
+
+    #[builder(default = "1")]
+    pub dp_size: u32,
+}
+
+impl MockEngineArgs {
+    pub fn builder() -> MockEngineArgsBuilder {
+        MockEngineArgsBuilder::default()
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 094ed2910b..971e3288ba 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -43,8 +43,8 @@
 use crate::kv_router::protocols::ForwardPassMetrics;
 use crate::mocker::evictor::LRUEvictor;
 use crate::mocker::kv_manager::KvManager;
-use crate::mocker::protocols::DirectRequest;
-use crate::mocker::protocols::{MoveBlock, PrefillCost, UniqueBlock};
+use crate::mocker::protocols::{DirectRequest, MockEngineArgs};
+use crate::mocker::protocols::{MoveBlock, OutputSignal, PrefillCost, UniqueBlock};
 use crate::mocker::sequence::ActiveSequence;
 use std::collections::HashMap;
 use std::collections::VecDeque;
@@ -195,29 +195,22 @@ pub struct Scheduler {
 impl Scheduler {
     /// Create a new Scheduler with the given parameters
     pub fn new(
-        num_gpu_blocks: usize,
-        block_size: usize,
-        max_num_batched_tokens: Option<usize>,
-        watermark: Option<f64>,
-        speedup_ratio: Option<f64>,
-        output_tx: Option<mpsc::Sender<Uuid>>,
+        args: MockEngineArgs,
+        output_tx: Option<mpsc::Sender<OutputSignal>>,
         cancellation_token: Option<CancellationToken>,
     ) -> Self {
-        let max_num_batched_tokens = max_num_batched_tokens.unwrap_or(8192);
-        let watermark = watermark.unwrap_or(0.01);
-
         let state = Arc::new(Mutex::new(SchedulerState::default()));
-        let kv_manager = Arc::new(Mutex::new(KvManager::new(num_gpu_blocks, block_size)));
-
-        // Assert speedup_ratio is greater than 0 if provided
-        if let Some(ratio) = speedup_ratio {
-            assert!(
-                ratio > 0.0,
-                "speedup_ratio must be greater than 0, got: {}",
-                ratio
-            );
-        }
-        let speedup_ratio = speedup_ratio.unwrap_or(1.0);
+        let kv_manager = Arc::new(Mutex::new(KvManager::new(
+            args.num_gpu_blocks,
+            args.block_size,
+        )));
+
+        // Assert speedup_ratio is greater than 0
+        assert!(
+            args.speedup_ratio > 0.0,
+            "speedup_ratio must be greater than 0, got: {}",
+            args.speedup_ratio
+        );
 
         // Create channel for request handling
         let (request_tx, mut request_rx) = mpsc::channel::<DirectRequest>(1024);
@@ -259,7 +252,7 @@ impl Scheduler {
                         let mut current_blocks = kv_manager_guard.num_active_blocks();
                         let mut current_tokens = state_guard.num_batched_tokens();
                         while let Some((uuid, request)) = state_guard.next() {
-                            let active_sequence = get_active_sequence(request, block_size);
+                            let active_sequence = get_active_sequence(request, args.block_size);
 
                             // Update predictive budgets
                             let prefill_cost = kv_manager_guard.get_prefill_cost(&active_sequence);
@@ -269,8 +262,8 @@ impl Scheduler {
                             current_tokens += new_tokens;
 
                             // Check if it can be scheduled
-                            let under_block_budget = current_blocks as f64 <= (1. - watermark) * kv_manager_guard.max_capacity() as f64;
-                            let under_token_budget = current_tokens <= max_num_batched_tokens;
+                            let under_block_budget = current_blocks as f64 <= (1. - args.watermark) * kv_manager_guard.max_capacity() as f64;
+                            let under_token_budget = args.max_num_batched_tokens.is_none_or(|limit| current_tokens <= limit);
                             if under_block_budget && under_token_budget {
                                 state_guard.start_prefill(uuid, active_sequence, Some(prefill_cost));
                                 should_schedule = false;
@@ -328,18 +321,29 @@ impl Scheduler {
 
                             // Send UUID notification for each generated token
                             if let Some(tx) = &output_tx_clone {
-                                let _ = tx.try_send(uuid);
+                                let signal = OutputSignal {
+                                    uuid,
+                                    completed: false,
+                                };
+                                let _ = tx.try_send(signal);
                             }
 
                             // Check if we're done after generating
                             if sequence.generated_tokens() >= sequence.max_output_tokens() {
+                                if let Some(tx) = &output_tx_clone {
+                                    let signal = OutputSignal {
+                                        uuid,
+                                        completed: true,
+                                    };
+                                    let _ = tx.try_send(signal);
+                                }
                                 state_guard.complete(&uuid);
                                 continue;
                             }
                         }
 
                         // Sleep once for the adjusted duration
-                        let adjusted_time = Duration::from_secs_f64(total_time.as_secs_f64() / speedup_ratio);
+                        let adjusted_time = Duration::from_secs_f64(total_time.as_secs_f64() / args.speedup_ratio);
                         if adjusted_time.as_millis() > 0 {
                             tokio::time::sleep(adjusted_time).await;
                         }
@@ -481,18 +485,18 @@ mod tests {
         let max_output_tokens: usize = 100;
 
         // Create channel for token output
-        let (output_tx, mut output_rx) = mpsc::channel::<Uuid>(1024);
-
-        // Create scheduler with internal KvManager
-        let scheduler = Scheduler::new(
-            kv_capacity,
-            block_size,
-            None,
-            None,
-            Some(10.0), // speedup_ratio
-            Some(output_tx),
-            None,
-        );
+        let (output_tx, mut output_rx) = mpsc::channel::<OutputSignal>(1024);
+
+        // Create scheduler args using builder
+        let args = MockEngineArgs::builder()
+            .num_gpu_blocks(kv_capacity)
+            .block_size(block_size)
+            .speedup_ratio(10.0)
+            .build()
+            .unwrap();
+
+        // Create scheduler with new args struct
+        let scheduler = Scheduler::new(args, Some(output_tx), None);
 
         // Create shared tokens for caching case
         let shared_tokens = if use_shared_tokens {
@@ -523,6 +527,7 @@ mod tests {
                 tokens: input_tokens,
                 max_output_tokens,
                 uuid: None,
+                dp_rank: None,
             };
             scheduler.receive(request).await;
         }

From 009ec7874d33aa3e5de3f1b3df25b9803970357d Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Wed, 11 Jun 2025 19:25:43 -0700
Subject: [PATCH 07/36] fixed prefill cost, and more conservative watermarking

---
 lib/llm/src/mocker/engine.rs     |   3 +-
 lib/llm/src/mocker/kv_manager.rs |   3 +-
 lib/llm/src/mocker/protocols.rs  |   3 +-
 lib/llm/src/mocker/scheduler.rs  | 163 ++++++++++++++++++++++++++++---
 4 files changed, 155 insertions(+), 17 deletions(-)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index ec70cd4ea2..21abaf3a1e 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -62,12 +62,13 @@ impl MockVllmEngine {
         ));
 
         // Create multiple schedulers and their background tasks
-        for _ in 0..args.dp_size {
+        for dp_rank in 0..args.dp_size {
             // Create a shared output channel that this scheduler will use
             let (output_tx, output_rx) = mpsc::channel::<OutputSignal>(1024);
 
             let scheduler = Scheduler::new(
                 args.clone(),
+                Some(dp_rank),
                 Some(output_tx),
                 None, // No global cancellation token
             );
diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs
index 71522dcb58..ca2d6fc50a 100644
--- a/lib/llm/src/mocker/kv_manager.rs
+++ b/lib/llm/src/mocker/kv_manager.rs
@@ -178,7 +178,8 @@ impl KvManager {
     pub fn probe_new_blocks(&self, blocks: &[UniqueBlock]) -> usize {
         blocks
             .iter()
-            .filter(|&block| !self.active_blocks.contains_key(block))
+            // .filter(|&block| !self.active_blocks.contains_key(block))
+            .filter(|&block| !self.all_blocks.contains(block))
             .count()
     }
 
diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs
index fc66ce061b..4f59e2d8dc 100644
--- a/lib/llm/src/mocker/protocols.rs
+++ b/lib/llm/src/mocker/protocols.rs
@@ -81,7 +81,8 @@ pub struct MockEngineArgs {
     #[builder(default = "64")]
     pub block_size: usize,
 
-    #[builder(default)]
+    // vLLM's default for the OpenAI API server; for the LLM class it's 16384
+    #[builder(default = Some(8192))]
     pub max_num_batched_tokens: Option<usize>,
 
     #[builder(default = "0.01")]
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 971e3288ba..5749db730a 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -178,6 +178,10 @@ impl SchedulerState {
             panic!("Expected ActiveSequence in running queue")
         };
         let signals = active_sequence.reset_with_signal();
+
+        // Note: For preemption, we don't compute hit rate since we don't have access to new_tokens
+        // and the sequence is being reset anyway. Hit rate tracking is primarily for new scheduling attempts.
+
         self.first_in_line(uuid, Request::Active(active_sequence));
 
         signals
@@ -187,15 +191,18 @@ impl SchedulerState {
 /// Manages scheduling of requests using KvManager resources
 #[derive(Clone)]
 pub struct Scheduler {
+    dp_rank: Option<u32>,
     state: Arc<Mutex<SchedulerState>>,
     kv_manager: Arc<Mutex<KvManager>>,
     request_tx: mpsc::Sender<DirectRequest>,
+    hit_rates: Arc<Mutex<VecDeque<f32>>>,
 }
 
 impl Scheduler {
     /// Create a new Scheduler with the given parameters
     pub fn new(
         args: MockEngineArgs,
+        dp_rank: Option<u32>,
         output_tx: Option<mpsc::Sender<OutputSignal>>,
         cancellation_token: Option<CancellationToken>,
     ) -> Self {
@@ -204,6 +211,7 @@ impl Scheduler {
             args.num_gpu_blocks,
             args.block_size,
         )));
+        let hit_rates = Arc::new(Mutex::new(VecDeque::with_capacity(1000)));
 
         // Assert speedup_ratio is greater than 0
         assert!(
@@ -220,6 +228,7 @@ impl Scheduler {
         let kv_manager_clone = kv_manager.clone();
         let output_tx_clone = output_tx.clone();
         let cancel_token_clone = cancellation_token.unwrap_or_default().clone();
+        let hit_rates_clone = hit_rates.clone();
 
         // Spawn main background task with cancellation token
         tokio::spawn(async move {
@@ -256,7 +265,8 @@ impl Scheduler {
 
                             // Update predictive budgets
                             let prefill_cost = kv_manager_guard.get_prefill_cost(&active_sequence);
-                            let new_blocks = prefill_cost.new_blocks;
+                            let new_tokens = active_sequence.len();
+                            let new_blocks = (new_tokens + 1) / args.block_size;  // this is conservative, assumes no cache hit
                             let new_tokens = prefill_cost.new_tokens;
                             current_blocks += new_blocks;
                             current_tokens += new_tokens;
@@ -264,13 +274,27 @@ impl Scheduler {
                             // Check if it can be scheduled
                             let under_block_budget = current_blocks as f64 <= (1. - args.watermark) * kv_manager_guard.max_capacity() as f64;
                             let under_token_budget = args.max_num_batched_tokens.is_none_or(|limit| current_tokens <= limit);
-                            if under_block_budget && under_token_budget {
-                                state_guard.start_prefill(uuid, active_sequence, Some(prefill_cost));
-                                should_schedule = false;
-                            } else {
+
+                            // Cannot schedule, put first in line instead
+                            if !(under_block_budget && under_token_budget) {
                                 state_guard.first_in_line(uuid, Request::Active(active_sequence));
                                 break;
                             }
+
+                            // Compute and store hit rate
+                            let hit_rate = (!active_sequence.is_empty())
+                                .then(|| 1.0 - (new_tokens as f32 / active_sequence.len() as f32))
+                                .unwrap_or(0.0);
+                            {
+                                let mut hit_rates_guard = hit_rates_clone.lock().await;
+                                hit_rates_guard.push_back(hit_rate);
+                                if hit_rates_guard.len() > 1000 {
+                                    hit_rates_guard.pop_front();
+                                }
+                            }
+
+                            state_guard.start_prefill(uuid, active_sequence, Some(prefill_cost));
+                            should_schedule = false;
                         }
                     }
 
@@ -343,6 +367,8 @@ impl Scheduler {
                         }
 
                         // Sleep once for the adjusted duration
+                        drop(kv_manager_guard);
+                        drop(state_guard);
                         let adjusted_time = Duration::from_secs_f64(total_time.as_secs_f64() / args.speedup_ratio);
                         if adjusted_time.as_millis() > 0 {
                             tokio::time::sleep(adjusted_time).await;
@@ -353,9 +379,11 @@ impl Scheduler {
         });
 
         Self {
+            dp_rank,
             state,
             kv_manager,
             request_tx,
+            hit_rates,
         }
     }
 
@@ -384,30 +412,44 @@ impl Scheduler {
 
     /// Returns forward pass metrics for monitoring purposes
     pub async fn get_forward_pass_metrics(&self) -> ForwardPassMetrics {
+        // Acquire all locks in consistent order: state -> kv_manager -> hit_rates
         let state = self.state.lock().await;
         let kv_manager = self.kv_manager.lock().await;
+        let hit_rates_guard = self.hit_rates.lock().await;
 
-        // Get the active blocks and total capacity from KvManager
+        // Get state metrics
+        let request_active_slots = state.decode.len() as u64;
+        let num_requests_waiting = state.waiting.len() as u64;
+
+        // Get KV manager metrics
         let active_blocks_count = kv_manager.active_blocks().len() as u64;
         let total_capacity = kv_manager.max_capacity() as u64;
-
-        // Calculate GPU cache usage percentage
         let gpu_cache_usage_perc = if total_capacity > 0 {
             active_blocks_count as f32 / total_capacity as f32
         } else {
             0.0
         };
 
+        // Get hit rate metrics
+        let gpu_prefix_cache_hit_rate = if hit_rates_guard.is_empty() {
+            0.0
+        } else {
+            let sum: f32 = hit_rates_guard.iter().sum();
+            sum / hit_rates_guard.len() as f32
+        };
+
         ForwardPassMetrics {
-            data_parallel_rank: None, // Default for backwards compatibility
-            request_active_slots: state.decode.len() as u64,
-            request_total_slots: 420, // Dummy value as specified
+            data_parallel_rank: self.dp_rank,
+            request_active_slots,
+            // vLLM's default max_num_seqs on GPUs with >= 70 GB of VRAM; otherwise 256, fallback is 128
+            request_total_slots: 1024,
             kv_active_blocks: active_blocks_count,
             kv_total_blocks: total_capacity,
-            num_requests_waiting: state.waiting.len() as u64,
+            num_requests_waiting,
             gpu_cache_usage_perc,
-            gpu_prefix_cache_hit_rate: 0.0, // Placeholder value as specified
+            gpu_prefix_cache_hit_rate,
         }
+        // Guards drop naturally here in reverse order (LIFO): hit_rates_guard, kv_manager, state
     }
 }
 
@@ -496,7 +538,7 @@ mod tests {
             .unwrap();
 
         // Create scheduler with new args struct
-        let scheduler = Scheduler::new(args, Some(output_tx), None);
+        let scheduler = Scheduler::new(args, None, Some(output_tx), None);
 
         // Create shared tokens for caching case
         let shared_tokens = if use_shared_tokens {
@@ -588,4 +630,97 @@ mod tests {
             expected_tokens
         );
     }
+
+    #[tokio::test]
+    async fn test_cache_hit_rate_with_identical_requests() {
+        let block_size: usize = 64;
+        let max_output_tokens: usize = 10;
+        let speedup_ratio = 10.0;
+        let num_requests = 10;
+        let token_length = 65;
+
+        // Create channel for token output
+        let (output_tx, mut output_rx) = mpsc::channel::<OutputSignal>(1024);
+
+        // Create scheduler args
+        let args = MockEngineArgs::builder()
+            .num_gpu_blocks(1000) // Large enough to not be a constraint
+            .block_size(block_size)
+            .speedup_ratio(speedup_ratio)
+            .build()
+            .unwrap();
+
+        // Create scheduler
+        let scheduler = Scheduler::new(args, None, Some(output_tx), None);
+
+        // Create identical tokens for all requests
+        let identical_tokens: Vec<u32> = (0..token_length).map(|i| i as u32).collect();
+
+        // Send all requests with identical tokens
+        for _ in 0..num_requests {
+            let request = DirectRequest {
+                tokens: identical_tokens.clone(),
+                max_output_tokens,
+                uuid: None,
+                dp_rank: None,
+            };
+            scheduler.receive(request).await;
+            // Sleep for 0.1 second after each request
+            tokio::time::sleep(Duration::from_millis(100)).await;
+        }
+
+        // Collect all generated tokens
+        let mut received_tokens = 0;
+
+        // Set up a timeout that resets to 0.5 seconds on each received token
+        let timeout = tokio::time::sleep(Duration::from_millis(500));
+        tokio::pin!(timeout);
+
+        // Set up debug ticker interval
+        let mut debug_interval = interval(Duration::from_millis(500));
+
+        loop {
+            tokio::select! {
+                biased;
+
+                // Manual debug ticker that prints forward pass metrics
+                _ = debug_interval.tick() => {
+                    let _metrics = scheduler.get_forward_pass_metrics().await;
+                    println!("Forward Pass Metrics: {:#?}", _metrics);
+                }
+
+                Some(_signal) = output_rx.recv() => {
+                    received_tokens += 1;
+                    // Reset timeout whenever we receive a token
+                    timeout.set(tokio::time::sleep(Duration::from_millis(500)));
+                }
+
+                _ = &mut timeout => {
+                    // Break when timeout occurs (no more tokens for 0.5 seconds)
+                    break;
+                }
+            }
+        }
+
+        // Verify forward pass metrics
+        let metrics = scheduler.get_forward_pass_metrics().await;
+
+        assert_eq!(
+            metrics.num_requests_waiting, 0,
+            "Expected no waiting requests, got {}",
+            metrics.num_requests_waiting
+        );
+
+        assert!(
+            metrics.gpu_prefix_cache_hit_rate > 0.8,
+            "Expected cache hit rate > 0.8, got {}",
+            metrics.gpu_prefix_cache_hit_rate
+        );
+
+        println!(
+            "Test passed! Cache hit rate: {:.3}",
+            metrics.gpu_prefix_cache_hit_rate
+        );
+        println!("Received {} tokens", received_tokens);
+    }
 }

From ee11427f2159c5914906dffccb778ab0740da724 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Wed, 11 Jun 2025 23:55:14 -0700
Subject: [PATCH 08/36] fwd pass metrics

---
 lib/llm/src/mocker/engine.rs    | 128 +++++++++++++++++++++++++++-----
 lib/llm/src/mocker/scheduler.rs |   6 +-
 2 files changed, 111 insertions(+), 23 deletions(-)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index 21abaf3a1e..9df68b6f52 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -18,19 +18,24 @@
 //! This module provides an AsyncEngine implementation that wraps the Scheduler
 //! to provide streaming token generation with realistic timing simulation.
 
+use crate::kv_router::publisher::WorkerMetricsPublisher;
 use crate::mocker::protocols::{DirectRequest, MockEngineArgs, OutputSignal};
 use crate::mocker::scheduler::Scheduler;
+use tokio_util::sync::CancellationToken;
 
 use dynamo_runtime::{
+    component::Component,
     engine::AsyncEngineContextProvider,
     pipeline::{async_trait, AsyncEngine, Error, ManyOut, ResponseStream, SingleIn},
     protocols::annotated::Annotated,
+    Result,
 };
 
 use rand::Rng;
 use std::collections::HashMap;
 use std::sync::Arc;
 use tokio::sync::{mpsc, Mutex};
+use tokio::time::{interval, Duration};
 use tokio_stream::wrappers::ReceiverStream;
 use uuid::Uuid;
 
@@ -51,16 +56,47 @@ pub struct MockVllmEngine {
     schedulers: Vec<Scheduler>,
     active_requests: Arc<Mutex<HashMap<Uuid, mpsc::Sender<OutputSignal>>>>,
     dp_size: u32,
+    cancel_token: CancellationToken,
 }
 
 impl MockVllmEngine {
     /// Create a new MockVllmEngine with the given parameters
-    pub fn new(args: MockEngineArgs) -> Self {
-        let mut schedulers = Vec::new();
+    pub async fn new(
+        args: MockEngineArgs,
+        component: Option<Component>,
+        cancel_token: Option<CancellationToken>,
+    ) -> Result<Self> {
         let active_requests = Arc::new(Mutex::new(
             HashMap::<Uuid, mpsc::Sender<OutputSignal>>::new(),
         ));
 
+        let cancel_token = cancel_token.unwrap_or_default();
+
+        // Create schedulers and start their background tasks
+        let schedulers =
+            Self::start_schedulers(args.clone(), active_requests.clone(), cancel_token.clone());
+
+        // Start metrics publishing tasks
+        Self::start_metrics_publishing(&schedulers, component, cancel_token.clone()).await?;
+
+        let engine = Self {
+            schedulers,
+            active_requests,
+            dp_size: args.dp_size,
+            cancel_token,
+        };
+
+        Ok(engine)
+    }
+
+    /// Create schedulers and spawn their background tasks for distributing token notifications
+    fn start_schedulers(
+        args: MockEngineArgs,
+        active_requests: Arc<Mutex<HashMap<Uuid, mpsc::Sender<OutputSignal>>>>,
+        cancel_token: CancellationToken,
+    ) -> Vec<Scheduler> {
+        let mut schedulers = Vec::new();
+
         // Create multiple schedulers and their background tasks
         for dp_rank in 0..args.dp_size {
             // Create a shared output channel that this scheduler will use
@@ -70,7 +106,7 @@ impl MockVllmEngine {
                 args.clone(),
                 Some(dp_rank),
                 Some(output_tx),
-                None, // No global cancellation token
+                Some(cancel_token.clone()),
             );
 
             schedulers.push(scheduler);
@@ -78,31 +114,80 @@ impl MockVllmEngine {
             // Spawn a background task for this scheduler to distribute token notifications to active requests
             let output_rx = Arc::new(Mutex::new(output_rx));
             let active_requests_clone = active_requests.clone();
+            let cancel_token_cloned = cancel_token.clone();
 
             tokio::spawn(async move {
                 loop {
-                    let signal = {
-                        let mut rx = output_rx.lock().await;
-                        match rx.recv().await {
-                            Some(signal) => signal,
-                            None => break, // Channel closed
+                    tokio::select! {
+                        signal_result = async {
+                            let mut rx = output_rx.lock().await;
+                            rx.recv().await
+                        } => {
+                            let Some(signal) = signal_result else {
+                                break; // Channel closed
+                            };
+
+                            // Notify the specific request that a token was generated
+                            let active = active_requests_clone.lock().await;
+                            if let Some(request_tx) = active.get(&signal.uuid) {
+                                let _ = request_tx.send(signal).await;
+                            }
+                        }
+                        _ = cancel_token_cloned.cancelled() => {
+                            break;
                         }
-                    };
-
-                    // Notify the specific request that a token was generated
-                    let active = active_requests_clone.lock().await;
-                    if let Some(request_tx) = active.get(&signal.uuid) {
-                        let _ = request_tx.send(signal).await;
                     }
                 }
             });
         }
 
-        Self {
-            schedulers,
-            active_requests,
-            dp_size: args.dp_size,
+        schedulers
+    }
+
+    /// Start background tasks to poll and publish metrics every second
+    async fn start_metrics_publishing(
+        schedulers: &[Scheduler],
+        component: Option<Component>,
+        cancel_token: CancellationToken,
+    ) -> Result<()> {
+        let metrics_publisher = Arc::new(WorkerMetricsPublisher::new()?);
+
+        if let Some(comp) = component {
+            metrics_publisher.create_endpoint(comp).await?;
+        }
+
+        for (dp_rank, scheduler) in schedulers.iter().enumerate() {
+            let scheduler = scheduler.clone();
+            let publisher = metrics_publisher.clone();
+            let dp_rank = dp_rank as u32;
+            let cancel_token = cancel_token.clone();
+
+            tokio::spawn(async move {
+                let mut interval = interval(Duration::from_secs(1));
+
+                loop {
+                    tokio::select! {
+                        _ = interval.tick() => {
+                            // Get metrics from scheduler
+                            let metrics = scheduler.get_forward_pass_metrics().await;
+
+                            // Publish metrics
+                            if let Err(e) = publisher.publish(Arc::new(metrics)) {
+                                tracing::warn!("Failed to publish metrics for DP rank {}: {}", dp_rank, e);
+                            } else {
+                                tracing::trace!("Published metrics for DP rank {}", dp_rank);
+                            }
+                        }
+                        _ = cancel_token.cancelled() => {
+                            tracing::info!("Metrics publishing cancelled for DP rank {}", dp_rank);
+                            break;
+                        }
+                    }
+                }
+            });
         }
+
+        Ok(())
     }
 }
 
@@ -143,6 +228,7 @@ impl AsyncEngine<SingleIn<DirectRequest>, ManyOut<Annotated<String>>, Error> for
 
         let active_requests = self.active_requests.clone();
         let async_context = ctx.context();
+        let cancel_token = self.cancel_token.clone();
 
         // Spawn a task to handle the complex async logic
         tokio::spawn(async move {
@@ -161,6 +247,10 @@ impl AsyncEngine<SingleIn<DirectRequest>, ManyOut<Annotated<String>>, Error> for
                     _ = async_context.stopped() => {
                         break;
                     }
+
+                    _ = cancel_token.cancelled() => {
+                        break;
+                    }
                 }
             }
 
@@ -193,7 +283,7 @@ mod tests {
             .build()
             .unwrap();
 
-        let engine = MockVllmEngine::new(args);
+        let engine = MockVllmEngine::new(args, None, None).await.unwrap();
 
         // Create 4 DirectRequests: 2 for worker 0, 2 for worker 1
         let requests = vec![
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 5749db730a..368d21f55b 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -282,9 +282,7 @@ impl Scheduler {
                             }
 
                             // Compute and store hit rate
-                            let hit_rate = (!active_sequence.is_empty())
-                                .then(|| 1.0 - (new_tokens as f32 / active_sequence.len() as f32))
-                                .unwrap_or(0.0);
+                            let hit_rate = if !active_sequence.is_empty() { 1.0 - (new_tokens as f32 / active_sequence.len() as f32) } else { 0.0 };
                             {
                                 let mut hit_rates_guard = hit_rates_clone.lock().await;
                                 hit_rates_guard.push_back(hit_rate);
@@ -644,7 +642,7 @@ mod tests {
 
         // Create scheduler args
         let args = MockEngineArgs::builder()
-            .num_gpu_blocks(1000) // Large enough to not be a constraint
+            .num_gpu_blocks(100) // Large enough to not be a constraint
             .block_size(block_size)
             .speedup_ratio(speedup_ratio)
             .build()

From 8e8d0b4cec406e506147f94bbd490f884ff50493 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Fri, 13 Jun 2025 02:06:40 -0700
Subject: [PATCH 09/36] can emit kv event, not tested

---
 lib/llm/src/mocker/engine.rs     |  1 +
 lib/llm/src/mocker/kv_manager.rs | 80 +++++++++++++++++++++++----
 lib/llm/src/mocker/protocols.rs  | 43 +++++++++++++--
 lib/llm/src/mocker/scheduler.rs  | 41 +++++++++++---
 lib/llm/src/mocker/sequence.rs   | 94 ++++++++++++++++++++++----------
 5 files changed, 208 insertions(+), 51 deletions(-)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index 9df68b6f52..914a6c1ad0 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -106,6 +106,7 @@ impl MockVllmEngine {
                 args.clone(),
                 Some(dp_rank),
                 Some(output_tx),
+                None,
                 Some(cancel_token.clone()),
             );
 
diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs
index ca2d6fc50a..37721db04e 100644
--- a/lib/llm/src/mocker/kv_manager.rs
+++ b/lib/llm/src/mocker/kv_manager.rs
@@ -46,10 +46,11 @@
 //! implementation of the main block manager.
 
 use crate::mocker::evictor::LRUEvictor;
-use crate::mocker::protocols::{MoveBlock, PrefillCost, UniqueBlock};
+use crate::mocker::protocols::{MoveBlock, MoveBlockResponse, PrefillCost, UniqueBlock};
 use crate::mocker::sequence::ActiveSequence;
 use derive_getters::Getters;
 use std::collections::{HashMap, HashSet};
+use tokio::sync::mpsc;
 
 #[derive(Getters)]
 pub struct KvManager {
@@ -64,10 +65,20 @@ pub struct KvManager {
     inactive_blocks: LRUEvictor<UniqueBlock>,
 
     all_blocks: HashSet<UniqueBlock>,
+
+    move_block_response_tx: Option<mpsc::UnboundedSender<MoveBlockResponse>>,
 }
 
 impl KvManager {
     pub fn new(max_capacity: usize, block_size: usize) -> Self {
+        Self::new_with_sender(max_capacity, block_size, None)
+    }
+
+    pub fn new_with_sender(
+        max_capacity: usize,
+        block_size: usize,
+        move_block_response_tx: Option<mpsc::UnboundedSender<MoveBlockResponse>>,
+    ) -> Self {
         let active_blocks = HashMap::new();
         let inactive_blocks = LRUEvictor::default();
         let all_blocks = HashSet::new();
@@ -78,13 +89,39 @@ impl KvManager {
             active_blocks,
             inactive_blocks,
             all_blocks,
+            move_block_response_tx,
+        }
+    }
+
+    /// Utility method to send block responses with optional reversing
+    fn send_block_response(
+        &self,
+        mut blocks: Vec<u64>,
+        reverse: bool,
+        store: bool,
+        parent_hash: Option<u64>,
+    ) {
+        if let Some(ref tx) = self.move_block_response_tx {
+            if !blocks.is_empty() {
+                if reverse {
+                    blocks.reverse();
+                }
+                let response = if store {
+                    MoveBlockResponse::Store(blocks, parent_hash)
+                } else {
+                    MoveBlockResponse::Remove(blocks)
+                };
+                tx.send(response).unwrap();
+            }
         }
     }
 
     /// Process a MoveBlock instruction synchronously
     pub fn process(&mut self, event: &MoveBlock) -> bool {
         match event {
-            MoveBlock::Use(hashes, _) => {
+            MoveBlock::Use(hashes) => {
+                let mut blocks_stored = Vec::<u64>::new();
+
                 for hash in hashes {
                     // First check if it already exists in active blocks
                     if let Some(ref_count) = self.active_blocks.get_mut(hash) {
@@ -106,30 +143,47 @@ impl KvManager {
 
                     // If at max capacity, evict the oldest entry from inactive blocks
                     if active_count + inactive_count >= self.max_capacity {
-                        if let Some(evicted) = self.inactive_blocks.evict() {
-                            // Remove evicted block from all_blocks
-                            self.all_blocks.remove(&evicted);
-                        } else {
-                            // Cannot evict block, meaning no free blocks left in inactive pool
-                            // Send a signal, scheduler would expect to handle preemption upon receiving this
+                        let Some(evicted) = self.inactive_blocks.evict() else {
                             return false;
+                        };
+                        self.all_blocks.remove(&evicted);
+                        if let UniqueBlock::FullBlock(evicted_full_block) = evicted {
+                            self.send_block_response(vec![evicted_full_block], false, false, None);
                         }
                     }
 
                     // Now insert the new block in active blocks with reference count 1
                     self.active_blocks.insert(hash.clone(), 1);
-                    // Add to all_blocks as it's a new block
                     self.all_blocks.insert(hash.clone());
+                    if self.move_block_response_tx.is_some() {
+                        if let UniqueBlock::FullBlock(stored_full_block) = hash {
+                            blocks_stored.push(*stored_full_block);
+                        }
+                    }
                 }
+                self.send_block_response(blocks_stored, false, true, None);
             }
+
             MoveBlock::Destroy(hashes) => {
+                let mut blocks_destroyed = Vec::<u64>::new();
+
                 // Loop in inverse direction
                 for hash in hashes.iter().rev() {
                     self.active_blocks.remove(hash).unwrap();
                     // Remove from all_blocks when destroyed
                     assert!(self.all_blocks.remove(hash));
+
+                    // Track blocks for batch sending
+                    if self.move_block_response_tx.is_some() {
+                        if let UniqueBlock::FullBlock(destroyed_full_block) = hash {
+                            blocks_destroyed.push(*destroyed_full_block);
+                        }
+                    }
                 }
+
+                self.send_block_response(blocks_destroyed, true, false, None);
             }
+
             MoveBlock::Deref(hashes) => {
                 // Loop in inverse direction
                 for hash in hashes.iter().rev() {
@@ -149,7 +203,8 @@ impl KvManager {
                     }
                 }
             }
-            MoveBlock::Promote(uuid, hash) => {
+
+            MoveBlock::Promote(uuid, hash, parent_hash) => {
                 let uuid_block = UniqueBlock::PartialBlock(*uuid);
                 let hash_block = UniqueBlock::FullBlock(*hash);
 
@@ -167,6 +222,7 @@ impl KvManager {
                 // Update all_blocks
                 assert!(self.all_blocks.remove(&uuid_block));
                 self.all_blocks.insert(hash_block);
+                self.send_block_response(vec![*hash], false, true, *parent_hash);
             }
         }
 
@@ -252,7 +308,7 @@ mod tests {
         // Helper function to use multiple blocks that returns the response
         fn use_blocks(manager: &mut KvManager, ids: Vec<u64>) -> bool {
             let blocks = ids.into_iter().map(UniqueBlock::FullBlock).collect();
-            manager.process(&MoveBlock::Use(blocks, None))
+            manager.process(&MoveBlock::Use(blocks))
         }
 
         // First use 10 blocks (0 to 9) in a batch
@@ -279,7 +335,7 @@ mod tests {
         // Helper function to use multiple blocks
         fn use_blocks(manager: &mut KvManager, ids: Vec<u64>) {
             let blocks = ids.into_iter().map(UniqueBlock::FullBlock).collect();
-            manager.process(&MoveBlock::Use(blocks, None));
+            manager.process(&MoveBlock::Use(blocks));
         }
 
         // Helper function to destroy multiple blocks
diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs
index 4f59e2d8dc..7fd8895594 100644
--- a/lib/llm/src/mocker/protocols.rs
+++ b/lib/llm/src/mocker/protocols.rs
@@ -17,9 +17,12 @@ use derive_builder::Builder;
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
 
+use crate::kv_router::protocols::{
+    ExternalSequenceBlockHash, KvCacheEventData, KvCacheRemoveData, KvCacheStoreData,
+    KvCacheStoredBlockData, LocalBlockHash,
+};
+
 pub type Token = u32;
-pub type LocalBlockHash = u64;
-/// A global hash identifier for blocks
 pub type GlobalHash = u64;
 pub type NumBlocks = usize;
 
@@ -40,12 +43,19 @@ impl Default for UniqueBlock {
 }
 
 /// Represents different block movement operations in the cache
+/// For the Promote variant, the optional parent hash is the third field
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub enum MoveBlock {
-    Use(Vec<UniqueBlock>, Option<f64>),
+    Use(Vec<UniqueBlock>),
     Destroy(Vec<UniqueBlock>),
     Deref(Vec<UniqueBlock>),
-    Promote(Uuid, GlobalHash),
+    Promote(Uuid, GlobalHash, Option<u64>),
+}
+
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub enum MoveBlockResponse {
+    Store(Vec<GlobalHash>, Option<u64>),
+    Remove(Vec<GlobalHash>),
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -101,6 +111,31 @@ impl MockEngineArgs {
     }
 }
 
+/// Note: This assumes block_hash and tokens_hash are the same, which is not correct in rare cases
+/// where the sequence-aware hash differs from the token content hash.
+pub fn block_response_to_kv_event(response: MoveBlockResponse) -> KvCacheEventData {
+    match response {
+        MoveBlockResponse::Store(full_blocks, parent_hash) => {
+            KvCacheEventData::Stored(KvCacheStoreData {
+                parent_hash: parent_hash.map(ExternalSequenceBlockHash),
+                blocks: full_blocks
+                    .into_iter()
+                    .map(|block| KvCacheStoredBlockData {
+                        block_hash: ExternalSequenceBlockHash(block),
+                        tokens_hash: LocalBlockHash(block),
+                    })
+                    .collect(),
+            })
+        }
+        MoveBlockResponse::Remove(full_blocks) => KvCacheEventData::Removed(KvCacheRemoveData {
+            block_hashes: full_blocks
+                .into_iter()
+                .map(ExternalSequenceBlockHash)
+                .collect(),
+        }),
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 368d21f55b..a4ca6b830e 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -40,11 +40,13 @@
 //! ## NOTE
 //! The current prefill and decoding time simulations are not scientific at all and are WIP
 
-use crate::kv_router::protocols::ForwardPassMetrics;
+use crate::kv_router::protocols::{ForwardPassMetrics, KvCacheEventData};
 use crate::mocker::evictor::LRUEvictor;
 use crate::mocker::kv_manager::KvManager;
-use crate::mocker::protocols::{DirectRequest, MockEngineArgs};
-use crate::mocker::protocols::{MoveBlock, OutputSignal, PrefillCost, UniqueBlock};
+use crate::mocker::protocols::{
+    block_response_to_kv_event, MoveBlock, OutputSignal, PrefillCost, UniqueBlock,
+};
+use crate::mocker::protocols::{DirectRequest, MockEngineArgs, MoveBlockResponse};
 use crate::mocker::sequence::ActiveSequence;
 use std::collections::HashMap;
 use std::collections::VecDeque;
@@ -204,12 +206,23 @@ impl Scheduler {
         args: MockEngineArgs,
         dp_rank: Option<u32>,
         output_tx: Option<mpsc::Sender<OutputSignal>>,
+        kv_events_tx: Option<mpsc::Sender<KvCacheEventData>>,
         cancellation_token: Option<CancellationToken>,
     ) -> Self {
         let state = Arc::new(Mutex::new(SchedulerState::default()));
-        let kv_manager = Arc::new(Mutex::new(KvManager::new(
+
+        // Create internal channel for KV events only if needed
+        let (block_resp_tx, mut block_resp_rx) = if kv_events_tx.is_some() {
+            let (tx, rx) = mpsc::unbounded_channel::<MoveBlockResponse>();
+            (Some(tx), Some(rx))
+        } else {
+            (None, None)
+        };
+
+        let kv_manager = Arc::new(Mutex::new(KvManager::new_with_sender(
             args.num_gpu_blocks,
             args.block_size,
+            block_resp_tx,
         )));
         let hit_rates = Arc::new(Mutex::new(VecDeque::with_capacity(1000)));
 
@@ -320,6 +333,13 @@ impl Scheduler {
                             if !prefill_success {
                                 panic!("Block allocation for prefilling cannot fail.");
                             }
+
+                            // Drain KV events and forward to relay after prefill signal processing
+                            if let (Some(ref relay_tx), Some(ref mut rx)) = (&kv_events_tx, &mut block_resp_rx) {
+                                while let Ok(event) = rx.try_recv() {
+                                    let _ = relay_tx.try_send(block_response_to_kv_event(event));
+                                }
+                            }
                         }
 
                         // Process decoding
@@ -341,6 +361,13 @@ impl Scheduler {
                                 continue;
                             }
 
+                            // Drain KV events and forward to relay after decode signal processing
+                            if let (Some(ref relay_tx), Some(ref mut rx)) = (&kv_events_tx, &mut block_resp_rx) {
+                                while let Ok(event) = rx.try_recv() {
+                                    let _ = relay_tx.try_send(block_response_to_kv_event(event));
+                                }
+                            }
+
                             // Send UUID notification for each generated token
                             if let Some(tx) = &output_tx_clone {
                                 let signal = OutputSignal {
@@ -485,7 +512,7 @@ fn process_signals(
         }
 
         // Check we have a Use signal with blocks
-        let MoveBlock::Use(blocks, _) = signal else {
+        let MoveBlock::Use(blocks) = signal else {
             panic!("Failed signal is Invalid. Has to fail on generation signal.");
         };
 
@@ -536,7 +563,7 @@ mod tests {
             .unwrap();
 
         // Create scheduler with new args struct
-        let scheduler = Scheduler::new(args, None, Some(output_tx), None);
+        let scheduler = Scheduler::new(args, None, Some(output_tx), None, None);
 
         // Create shared tokens for caching case
         let shared_tokens = if use_shared_tokens {
@@ -649,7 +676,7 @@ mod tests {
             .unwrap();
 
         // Create scheduler
-        let scheduler = Scheduler::new(args, None, Some(output_tx), None);
+        let scheduler = Scheduler::new(args, None, Some(output_tx), None, None);
 
         // Create identical tokens for all requests
         let identical_tokens: Vec<u32> = (0..token_length).map(|i| i as u32).collect();
diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs
index b79e79872a..17ef65d2c3 100644
--- a/lib/llm/src/mocker/sequence.rs
+++ b/lib/llm/src/mocker/sequence.rs
@@ -73,7 +73,7 @@ impl ActiveSequence {
 
         let tokens = Tokens::from(tokens).into_sequence(block_size, None);
         let unique_blocks = create_unique_blocks_from_sequence(&tokens, None, block_size);
-        let creation_signal = Some(MoveBlock::Use(unique_blocks.clone(), None));
+        let creation_signal = Some(MoveBlock::Use(unique_blocks.clone()));
 
         Self {
             unique_blocks,
@@ -109,6 +109,17 @@ impl ActiveSequence {
         (sequence, signal)
     }
 
+    /// Get the hash of the second-to-last block (None if fewer than two blocks); panics if it is partial
+    fn get_parent_hash(&self) -> Option<u64> {
+        if self.unique_blocks.len() < 2 {
+            return None;
+        }
+        match &self.unique_blocks[self.unique_blocks.len() - 2] {
+            UniqueBlock::FullBlock(hash) => Some(*hash),
+            _ => panic!("Cannot have a partial block as parent"),
+        }
+    }
+
     /// Push a token to the sequence
     pub fn push(&mut self, token: u32) -> Option<Vec<MoveBlock>> {
         self.tokens.append(token).expect("Token push failed.");
@@ -128,12 +139,16 @@ impl ActiveSequence {
             self.unique_blocks.pop();
             self.unique_blocks
                 .push(UniqueBlock::FullBlock(last_block_hash));
-            signals.push(MoveBlock::Promote(uuid, last_block_hash));
+            signals.push(MoveBlock::Promote(
+                uuid,
+                last_block_hash,
+                self.get_parent_hash(),
+            ));
         }
 
         let new_partial_block = UniqueBlock::default();
         self.unique_blocks.push(new_partial_block.clone());
-        signals.push(MoveBlock::Use(vec![new_partial_block], None));
+        signals.push(MoveBlock::Use(vec![new_partial_block]));
         Some(signals)
     }
 
@@ -201,7 +216,7 @@ impl ActiveSequence {
         self.unique_blocks =
             create_unique_blocks_from_sequence(&self.tokens, None, self.block_size);
         self.generated_tokens = 0;
-        self.creation_signal = Some(MoveBlock::Use(self.unique_blocks.clone(), None));
+        self.creation_signal = Some(MoveBlock::Use(self.unique_blocks.clone()));
 
         free_signal
     }
@@ -233,7 +248,7 @@ mod tests {
         // Check that we got a Use signal
         assert!(signal1.is_some());
         match &signal1 {
-            Some(MoveBlock::Use(blocks, _)) => {
+            Some(MoveBlock::Use(blocks)) => {
                 assert_eq!(blocks.len(), 1);
             }
             _ => panic!("Expected Use signal"),
@@ -252,24 +267,23 @@ mod tests {
         let signal_16 = signal_16.unwrap();
         assert_eq!(signal_16.len(), 2);
 
+        // First signal should be Promote for the previous block
+        match &signal_16[0] {
+            MoveBlock::Promote(_, _, parent_hash) => {
+                assert_eq!(*parent_hash, None);
+            }
+            _ => panic!("Expected Promote signal as second signal"),
+        }
+
         // Second signal should be Use for new partial block
         match &signal_16[1] {
-            MoveBlock::Use(blocks, _) => {
+            MoveBlock::Use(blocks) => {
                 assert_eq!(blocks.len(), 1);
                 assert!(matches!(blocks[0], UniqueBlock::PartialBlock(_)));
             }
             _ => panic!("Expected Use signal as first signal"),
         }
 
-        // First signal should be Promote for the previous block
-        match &signal_16[0] {
-            MoveBlock::Promote(uuid, _) => {
-                // The uuid is generated dynamically, so we just check it exists
-                let _ = uuid;
-            }
-            _ => panic!("Expected Promote signal as second signal"),
-        }
-
         // Verify state after pushing tokens
         assert_eq!(seq1.unique_blocks().len(), 2); // One full block and one partial block
         assert_eq!(seq1.len(), 17);
@@ -339,6 +353,32 @@ mod tests {
             "First two blocks should be identical"
         );
 
+        // Push tokens 33..=47 to seq1
+        for token in 33..48 {
+            seq1.push(token);
+        }
+
+        // Push token 48 and get the signal - this completes the block and triggers signals
+        let signal = seq1.push(48);
+        let signal = signal.unwrap();
+
+        // Check that signal[0] is promote
+        match &signal[0] {
+            MoveBlock::Promote(_, _, parent_hash) => {
+                // Check that the parent_hash matches unique_blocks[1], which should be a full block
+                if let UniqueBlock::FullBlock(expected_hash) = seq1.unique_blocks()[1] {
+                    assert_eq!(
+                        *parent_hash,
+                        Some(expected_hash),
+                        "Parent hash should match unique_blocks[1]"
+                    );
+                } else {
+                    panic!("unique_blocks[1] should be a full block");
+                }
+            }
+            _ => panic!("Expected Promote signal as first signal"),
+        }
+
         // Reset seq1 and check that it equals the original clone
         let free_signals = seq1.reset_with_signal();
 
@@ -355,7 +395,7 @@ mod tests {
         // Initial signal - should have received a Use signal for the partial block
         assert!(signal.is_some());
         match signal {
-            Some(MoveBlock::Use(blocks, _)) => {
+            Some(MoveBlock::Use(blocks)) => {
                 assert_eq!(blocks.len(), 1);
                 assert!(matches!(blocks[0], UniqueBlock::PartialBlock(_)));
             }
@@ -371,25 +411,23 @@ mod tests {
         let signals_second = seq.generate();
         assert_eq!(signals_second.len(), 2);
 
-        // First signal should be Use for new partial block
+        // First signal should be Promote
+        match &signals_second[0] {
+            MoveBlock::Promote(_, _, parent_hash) => {
+                assert_eq!(*parent_hash, None);
+            }
+            _ => panic!("Expected Promote signal as first signal after second token"),
+        }
+
+        // Second signal should be Use for new partial block
         match &signals_second[1] {
-            MoveBlock::Use(blocks, _) => {
+            MoveBlock::Use(blocks) => {
                 assert_eq!(blocks.len(), 1);
                 assert!(matches!(blocks[0], UniqueBlock::PartialBlock(_)));
             }
             _ => panic!("Expected Use signal as second signal after second token"),
         }
 
-        // Second signal should be Promote
-        match &signals_second[0] {
-            MoveBlock::Promote(uuid, hash) => {
-                // The uuid and hash values are generated dynamically, so we just check the event type
-                let _ = uuid;
-                let _ = hash;
-            }
-            _ => panic!("Expected Promote signal as first signal after second token"),
-        }
-
         // Generate fourth token - should not trigger new signals as it's adding to partial block
         let signals_third = seq.generate();
         assert_eq!(signals_third.len(), 0);

From e96f8103c037d7797899cb38773236b0e1432133 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Fri, 13 Jun 2025 02:59:31 -0700
Subject: [PATCH 10/36] move block resp test in kv manager

---
 lib/llm/src/mocker/kv_manager.rs | 91 ++++++++++++++++++++++++++++++--
 1 file changed, 87 insertions(+), 4 deletions(-)

diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs
index 37721db04e..9d061b8ed0 100644
--- a/lib/llm/src/mocker/kv_manager.rs
+++ b/lib/llm/src/mocker/kv_manager.rs
@@ -122,11 +122,13 @@ impl KvManager {
             MoveBlock::Use(hashes) => {
                 let mut blocks_stored = Vec::<u64>::new();
 
+                let mut parent_block: Option<&UniqueBlock> = None;
                 for hash in hashes {
                     // First check if it already exists in active blocks
                     if let Some(ref_count) = self.active_blocks.get_mut(hash) {
                         // Block already active, just increment reference count
                         *ref_count += 1;
+                        parent_block = Some(hash);
                         continue;
                     }
 
@@ -134,6 +136,7 @@ impl KvManager {
                     if self.inactive_blocks.remove(hash) {
                         // Insert into active with reference count 1
                         self.active_blocks.insert(hash.clone(), 1);
+                        parent_block = Some(hash);
                         continue;
                     }
 
@@ -161,7 +164,13 @@ impl KvManager {
                         }
                     }
                 }
-                self.send_block_response(blocks_stored, false, true, None);
+
+                let parent_hash = match parent_block {
+                    None => None,
+                    Some(UniqueBlock::FullBlock(block)) => Some(*block),
+                    Some(UniqueBlock::PartialBlock(_)) => panic!("parent block cannot be partial"),
+                };
+                self.send_block_response(blocks_stored, false, true, parent_hash);
             }
 
             MoveBlock::Destroy(hashes) => {
@@ -299,6 +308,7 @@ impl KvManager {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use tokio::sync::mpsc;
 
     #[test]
     fn test_failure_on_max_capacity() {
@@ -327,10 +337,12 @@ mod tests {
     }
 
     #[test]
-    // This is taken directly from the example in the vllm v1 prefix caching docs
     fn test_block_lifecycle_stringent() {
-        // Create a KvManager with 10 blocks capacity
-        let mut manager = KvManager::new(10, 16);
+        // Create a channel to listen to block responses
+        let (tx, mut rx) = mpsc::unbounded_channel::<MoveBlockResponse>();
+
+        // Create a KvManager with 10 blocks capacity and the response sender
+        let mut manager = KvManager::new_with_sender(10, 16, Some(tx));
 
         // Helper function to use multiple blocks
         fn use_blocks(manager: &mut KvManager, ids: Vec<u64>) {
@@ -350,6 +362,65 @@ mod tests {
             manager.process(&MoveBlock::Deref(blocks));
         }
 
+        // Helper function to assert block responses
+        fn assert_block_response(
+            rx: &mut mpsc::UnboundedReceiver<MoveBlockResponse>,
+            expected_type: &str,
+            expected_blocks: Vec<u64>,
+            description: &str,
+        ) {
+            let response = rx
+                .try_recv()
+                .unwrap_or_else(|_| panic!("Expected {} response {}", expected_type, description));
+
+            match (&response, expected_type) {
+                (MoveBlockResponse::Store(blocks, _parent_hash), "Store") => {
+                    assert_eq!(
+                        blocks.len(),
+                        expected_blocks.len(),
+                        "Expected {} blocks in Store response {}",
+                        expected_blocks.len(),
+                        description
+                    );
+                    assert_eq!(
+                        *blocks, expected_blocks,
+                        "Store blocks don't match expected {}",
+                        description
+                    );
+                }
+                (MoveBlockResponse::Remove(blocks), "Remove") => {
+                    assert_eq!(
+                        blocks.len(),
+                        expected_blocks.len(),
+                        "Expected {} blocks in Remove response {}",
+                        expected_blocks.len(),
+                        description
+                    );
+                    assert_eq!(
+                        *blocks, expected_blocks,
+                        "Remove blocks don't match expected {}",
+                        description
+                    );
+                }
+                _ => panic!(
+                    "Expected {} response, got {:?} {}",
+                    expected_type, response, description
+                ),
+            }
+        }
+
+        // Helper function to assert no response is received
+        fn assert_no_response(
+            rx: &mut mpsc::UnboundedReceiver<MoveBlockResponse>,
+            description: &str,
+        ) {
+            assert!(
+                rx.try_recv().is_err(),
+                "Expected no response {}",
+                description
+            );
+        }
+
         // Helper function to check if active blocks contain expected blocks with expected ref counts
         fn assert_active_blocks(manager: &KvManager, expected_blocks: &[(u64, usize)]) {
             assert_eq!(
@@ -400,9 +471,11 @@ mod tests {
 
         // First use blocks 0, 1, 2, 3, 4 in a batch
         use_blocks(&mut manager, (0..5).collect());
+        assert_block_response(&mut rx, "Store", vec![0, 1, 2, 3, 4], "after first use");
 
         // Then use blocks 0, 1, 5, 6 in a batch
         use_blocks(&mut manager, vec![0, 1, 5, 6]);
+        assert_block_response(&mut rx, "Store", vec![5, 6], "after second use");
 
         // Check that the blocks 0 and 1 are in active blocks, both with reference counts of 2
         assert_active_blocks(
@@ -412,9 +485,11 @@ mod tests {
 
         // Now destroy block 4
         destroy_blocks(&mut manager, vec![4]);
+        assert_block_response(&mut rx, "Remove", vec![4], "after destroy block 4");
 
         // And deref blocks 3, 2, 1, 0 in this order as a batch
         deref_blocks(&mut manager, vec![0, 1, 2, 3]);
+        assert_no_response(&mut rx, "after deref operation");
 
         // Check that the inactive_blocks is size 2 (via num_objects) and contains 3 and 2
         assert_inactive_blocks(&manager, 2, &[3, 2]);
@@ -422,6 +497,7 @@ mod tests {
 
         // Now destroy block 6
         destroy_blocks(&mut manager, vec![6]);
+        assert_block_response(&mut rx, "Remove", vec![6], "after block 6 eviction");
 
         // And deref blocks 5, 1, 0 as a batch
         deref_blocks(&mut manager, vec![0, 1, 5]);
@@ -432,6 +508,7 @@ mod tests {
 
         // Now use 0, 1, 2, 7, 8, 9 as a batch
         use_blocks(&mut manager, vec![0, 1, 2, 7, 8, 9]);
+        assert_block_response(&mut rx, "Store", vec![7, 8, 9], "after [7, 8, 9] use");
 
         // Check that the inactive_blocks is size 2, and contains 3 and 5
         assert_inactive_blocks(&manager, 2, &[3, 5]);
@@ -446,8 +523,14 @@ mod tests {
 
         // Now use blocks 10, 11, 12 as a batch
         use_blocks(&mut manager, vec![10, 11, 12]);
+        assert_block_response(&mut rx, "Remove", vec![3], "after block 5 eviction");
+        assert_block_response(&mut rx, "Store", vec![10, 11, 12], "after [10, 11, 12] use");
 
         // Check that the inactive_blocks is size 1 and contains only 5
         assert_inactive_blocks(&manager, 1, &[5]);
+
+        use_blocks(&mut manager, vec![13]);
+        assert_block_response(&mut rx, "Remove", vec![5], "after block 5 eviction");
+        assert_block_response(&mut rx, "Store", vec![13], "after block 13 use");
     }
 }

From c09f007067ded64c62bf7d68b37693b35832e38b Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Sat, 14 Jun 2025 02:50:20 -0700
Subject: [PATCH 11/36] basic test passes for both load metrics and kv events

---
 lib/llm/src/mocker/engine.rs    | 368 +++++++++++++++++++++++++++-----
 lib/llm/src/mocker/scheduler.rs |  22 +-
 2 files changed, 324 insertions(+), 66 deletions(-)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index 914a6c1ad0..ca6eb968de 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -28,9 +28,12 @@ use dynamo_runtime::{
     engine::AsyncEngineContextProvider,
     pipeline::{async_trait, AsyncEngine, Error, ManyOut, ResponseStream, SingleIn},
     protocols::annotated::Annotated,
+    traits::DistributedRuntimeProvider,
     Result,
 };
 
+use crate::kv_router::protocols::{KvCacheEvent, KvCacheEventData};
+use crate::kv_router::publisher::KvEventPublisher;
 use rand::Rng;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -72,12 +75,21 @@ impl MockVllmEngine {
 
         let cancel_token = cancel_token.unwrap_or_default();
 
-        // Create schedulers and start their background tasks
-        let schedulers =
+        // Create schedulers and get their KV event receivers
+        let (schedulers, kv_event_receivers) =
             Self::start_schedulers(args.clone(), active_requests.clone(), cancel_token.clone());
 
-        // Start metrics publishing tasks
-        Self::start_metrics_publishing(&schedulers, component, cancel_token.clone()).await?;
+        Self::start_metrics_publishing(&schedulers, component.clone(), cancel_token.clone())
+            .await?;
+
+        // Start KV events publishing with the actual receivers from schedulers
+        Self::start_kv_events_publishing(
+            kv_event_receivers,
+            component.clone(),
+            args.block_size,
+            cancel_token.clone(),
+        )
+        .await?;
 
         let engine = Self {
             schedulers,
@@ -90,27 +102,36 @@ impl MockVllmEngine {
     }
 
     /// Create schedulers and spawn their background tasks for distributing token notifications
+    /// Returns schedulers and their corresponding KV event receivers
     fn start_schedulers(
         args: MockEngineArgs,
         active_requests: Arc<Mutex<HashMap<Uuid, mpsc::Sender<OutputSignal>>>>,
         cancel_token: CancellationToken,
-    ) -> Vec<Scheduler> {
+    ) -> (
+        Vec<Scheduler>,
+        Vec<mpsc::UnboundedReceiver<KvCacheEventData>>,
+    ) {
         let mut schedulers = Vec::new();
+        let mut kv_event_receivers = Vec::new();
 
         // Create multiple schedulers and their background tasks
         for dp_rank in 0..args.dp_size {
             // Create a shared output channel that this scheduler will use
-            let (output_tx, output_rx) = mpsc::channel::<OutputSignal>(1024);
+            let (output_tx, output_rx) = mpsc::unbounded_channel::<OutputSignal>();
+
+            // Create a channel for KV events from this scheduler
+            let (kv_events_tx, kv_events_rx) = mpsc::unbounded_channel::<KvCacheEventData>();
 
             let scheduler = Scheduler::new(
                 args.clone(),
                 Some(dp_rank),
                 Some(output_tx),
-                None,
+                Some(kv_events_tx), // Pass the KV events sender to scheduler
                 Some(cancel_token.clone()),
             );
 
             schedulers.push(scheduler);
+            kv_event_receivers.push(kv_events_rx);
 
             // Spawn a background task for this scheduler to distribute token notifications to active requests
             let output_rx = Arc::new(Mutex::new(output_rx));
@@ -142,7 +163,7 @@ impl MockVllmEngine {
             });
         }
 
-        schedulers
+        (schedulers, kv_event_receivers)
     }
 
     /// Start background tasks to poll and publish metrics every second
@@ -151,12 +172,27 @@ impl MockVllmEngine {
         component: Option<Component>,
         cancel_token: CancellationToken,
     ) -> Result<()> {
+        println!("🔧 Creating metrics publisher...");
         let metrics_publisher = Arc::new(WorkerMetricsPublisher::new()?);
+        println!("✓ Metrics publisher created");
 
         if let Some(comp) = component {
-            metrics_publisher.create_endpoint(comp).await?;
+            println!("🔧 Creating metrics endpoint...");
+            tokio::spawn({
+                let publisher = metrics_publisher.clone();
+                async move {
+                    if let Err(e) = publisher.create_endpoint(comp.clone()).await {
+                        println!("Metrics endpoint failed: {}", e);
+                    }
+                }
+            });
+
+            // Give it a moment to start
+            tokio::time::sleep(Duration::from_millis(100)).await;
+            println!("✓ Metrics endpoint started (background)");
         }
 
+        println!("🔧 Starting metrics background tasks...");
         for (dp_rank, scheduler) in schedulers.iter().enumerate() {
             let scheduler = scheduler.clone();
             let publisher = metrics_publisher.clone();
@@ -164,7 +200,7 @@ impl MockVllmEngine {
             let cancel_token = cancel_token.clone();
 
             tokio::spawn(async move {
-                let mut interval = interval(Duration::from_secs(1));
+                let mut interval = interval(Duration::from_millis(100));
 
                 loop {
                     tokio::select! {
@@ -187,6 +223,82 @@ impl MockVllmEngine {
                 }
             });
         }
+        println!("✓ Metrics background tasks started");
+        Ok(())
+    }
+
+    /// Start background tasks to collect and publish KV events from schedulers
+    async fn start_kv_events_publishing(
+        kv_event_receivers: Vec<mpsc::UnboundedReceiver<KvCacheEventData>>,
+        component: Option<Component>,
+        block_size: usize,
+        cancel_token: CancellationToken,
+    ) -> Result<()> {
+        println!("🔧 Starting KV events publishing...");
+
+        // Only start KV events publishing if we have a component
+        let Some(comp) = component else {
+            println!("⚠️ No component provided, skipping KV events publishing");
+            return Ok(());
+        };
+        println!("✓ Component found for KV events publishing");
+
+        println!("🔧 Getting worker_id...");
+        let worker_id = comp
+            .drt()
+            .primary_lease()
+            .expect("Cannot publish KV events without lease") // ← This will PANIC on static!
+            .id();
+        // let worker_id = 0;
+        println!("✓ Worker_id set to: {}", worker_id);
+
+        println!("🔧 Creating KV event publisher...");
+        let kv_event_publisher = Arc::new(KvEventPublisher::new(
+            comp.clone(),
+            worker_id,
+            block_size,
+            None,
+        )?);
+        println!("✓ KV event publisher created");
+
+        println!(
+            "🔧 Starting KV event background tasks for {} receivers...",
+            kv_event_receivers.len()
+        );
+        for (dp_rank, mut kv_events_rx) in kv_event_receivers.into_iter().enumerate() {
+            println!("🔧 Starting background task for DP rank {}", dp_rank);
+            let publisher = kv_event_publisher.clone();
+            let dp_rank = dp_rank as u32;
+            let cancel_token = cancel_token.clone();
+
+            tokio::spawn(async move {
+                println!("✓ Background task started for DP rank {}", dp_rank);
+                loop {
+                    tokio::select! {
+                        // Receive actual KV events from the scheduler
+                        Some(event_data) = kv_events_rx.recv() => {
+                            // Wrap KvCacheEventData in a KvCacheEvent; event_id is a random UUID truncated to u64
+                            let event = KvCacheEvent {
+                                event_id: Uuid::new_v4().as_u128() as u64,
+                                data: event_data,
+                            };
+
+                            // Publish the event
+                            if let Err(e) = publisher.publish(event) {
+                                tracing::warn!("Failed to publish KV event for DP rank {}: {}", dp_rank, e);
+                            } else {
+                                tracing::trace!("Published KV event for DP rank {}", dp_rank);
+                            }
+                        }
+                        _ = cancel_token.cancelled() => {
+                            tracing::info!("KV events publishing cancelled for DP rank {}", dp_rank);
+                            break;
+                        }
+                    }
+                }
+            });
+        }
+        println!("✓ All KV event background tasks started");
 
         Ok(())
     }
@@ -267,77 +379,223 @@ impl AsyncEngine<SingleIn<DirectRequest>, ManyOut<Annotated<String>>, Error> for
 }
 
 #[cfg(test)]
-mod tests {
+mod integration_tests {
     use super::*;
-    use dynamo_runtime::pipeline::Context;
+    use crate::kv_router::indexer::RouterEvent;
+    use crate::kv_router::KV_EVENT_SUBJECT;
+    use dynamo_runtime::{
+        pipeline::Context,
+        pipeline::{network::Ingress, PushRouter},
+        traits::events::EventSubscriber,
+        DistributedRuntime, Worker,
+    };
     use futures::StreamExt;
+    use tokio::time::timeout;
 
     #[tokio::test]
-    async fn test_multiple_workers_with_token_limit() {
+    #[ignore] // Run with: cargo test -- --ignored
+    async fn test_mock_vllm_engine_full_integration() -> Result<()> {
         const DP_SIZE: u32 = 2;
         const TOKENS_PER_REQUEST: usize = 20;
-
-        // Create the MockVllmEngine using builder pattern
+        const BLOCK_SIZE: usize = 2;
+
+        // Create runtime and distributed runtime
+        let worker = Worker::from_settings()?;
+        let runtime = worker.runtime();
+        let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;
+        println!("✓ Runtime and distributed runtime created");
+
+        // Create component for MockVllmEngine (needed for publishers)
+        let test_component = distributed
+            .namespace("test")?
+            .component("mock-vllm")?
+            .service_builder()
+            .create()
+            .await?;
+        println!("✓ Test component created");
+
+        // Create MockVllmEngine WITH component (enables publishers)
         let args = MockEngineArgs::builder()
             .speedup_ratio(10.0)
             .dp_size(DP_SIZE)
+            .block_size(BLOCK_SIZE)
             .build()
             .unwrap();
 
-        let engine = MockVllmEngine::new(args, None, None).await.unwrap();
+        let engine = Arc::new(MockVllmEngine::new(args, Some(test_component.clone()), None).await?);
+        println!("✓ MockVllmEngine created with DP_SIZE: {}", DP_SIZE);
+
+        // Set up KV events subscriber
+        let mut kv_events_subscriber = test_component.subscribe(KV_EVENT_SUBJECT).await?;
+        println!("✓ KV events subscriber created");
+
+        // Wrap with Ingress and register with component/endpoint
+        let ingress = Ingress::for_engine(engine)?;
+        println!("✓ Ingress wrapper created");
+
+        // Start the server in background
+        let server_handle = tokio::spawn({
+            let test_component = test_component.clone();
+            async move {
+                if let Err(e) = test_component
+                    .endpoint("generate")
+                    .endpoint_builder()
+                    .handler(ingress)
+                    .start()
+                    .await
+                {
+                    eprintln!("❌ Generate endpoint failed: {}", e);
+                }
+            }
+        });
+        println!("✓ Server started in background");
+
+        // Give server time to start
+        tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
+        println!("✓ Server startup delay completed");
+
+        // Print all registered instances from etcd
+        match test_component.list_instances().await {
+            Ok(instances) => {
+                println!("📋 Found {} registered instances:", instances.len());
+                for instance in instances {
+                    println!(
+                        "  • {}/{}/{} (ID: {})",
+                        instance.namespace,
+                        instance.component,
+                        instance.endpoint,
+                        instance.instance_id
+                    );
+                }
+            }
+            Err(e) => {
+                println!("❌ Failed to list instances: {}", e);
+            }
+        }
+
+        // Create client
+        let client = distributed
+            .namespace("test")?
+            .component("mock-vllm")?
+            .endpoint("generate")
+            .client()
+            .await?;
+        println!("✓ Client created");
+
+        let router = PushRouter::from_client(client, Default::default()).await?;
+        println!("✓ Router created");
+
+        // Create test requests for both DP workers
+        let create_request = |tokens: Vec<u32>, dp_rank: u32| DirectRequest {
+            tokens,
+            max_output_tokens: TOKENS_PER_REQUEST,
+            uuid: None,
+            dp_rank: Some(dp_rank),
+        };
 
-        // Create 4 DirectRequests: 2 for worker 0, 2 for worker 1
         let requests = vec![
-            DirectRequest {
-                tokens: vec![1, 2, 3, 4],
-                max_output_tokens: TOKENS_PER_REQUEST,
-                uuid: None,
-                dp_rank: Some(0),
-            },
-            DirectRequest {
-                tokens: vec![5, 6, 7, 8],
-                max_output_tokens: TOKENS_PER_REQUEST,
-                uuid: None,
-                dp_rank: Some(0),
-            },
-            DirectRequest {
-                tokens: vec![9, 10, 11, 12],
-                max_output_tokens: TOKENS_PER_REQUEST,
-                uuid: None,
-                dp_rank: Some(1),
-            },
-            DirectRequest {
-                tokens: vec![13, 14, 15, 16],
-                max_output_tokens: TOKENS_PER_REQUEST,
-                uuid: None,
-                dp_rank: Some(1),
-            },
+            create_request(vec![1, 2, 3, 4, 5], 0),
+            create_request(vec![1, 2, 3, 4, 5], 0),
+            create_request(vec![1, 2, 3, 4, 5], 1),
+            create_request(vec![1, 2, 3, 4, 5], 1),
         ];
+        println!(
+            "✓ Test requests created ({} requests total)",
+            requests.len()
+        );
 
-        // Generate streams and collect all tokens from each
-        for request in requests {
-            let ctx = Context::new(request);
-            let stream = engine.generate(ctx).await.unwrap();
+        // Test each request
+        for (i, request) in requests.into_iter().enumerate() {
+            println!("Testing request {}", i + 1);
 
-            let tokens: Vec<_> = stream.collect().await;
+            let response_stream = router.generate(Context::new(request)).await?;
+            let responses: Vec<Annotated<String>> = response_stream.collect().await;
 
             // Verify each stream produces exactly the expected number of tokens
-            assert_eq!(tokens.len(), TOKENS_PER_REQUEST);
+            assert_eq!(
+                responses.len(),
+                TOKENS_PER_REQUEST,
+                "Request {} should produce {} tokens, got {}",
+                i + 1,
+                TOKENS_PER_REQUEST,
+                responses.len()
+            );
 
             // Verify all tokens contain valid data
-            for token in tokens {
-                assert!(token.data.is_some());
+            for (j, token) in responses.iter().enumerate() {
+                if let Some(char_data) = &token.data {
+                    assert!(
+                        !char_data.is_empty(),
+                        "Request {} token {} should not be empty",
+                        i + 1,
+                        j + 1
+                    );
+                } else {
+                    panic!("Request {} token {} should have data", i + 1, j + 1);
+                }
+            }
+
+            println!(
+                "✓ Request {} completed successfully with {} tokens",
+                i + 1,
+                responses.len()
+            );
+        }
+
+        println!("🎉 All requests completed successfully!");
+
+        // Try to receive at least one KV event with 100ms timeout
+        println!("Waiting for KV event with 100ms timeout...");
+        let msg = timeout(Duration::from_millis(100), kv_events_subscriber.next())
+            .await
+            .map_err(|_| Error::msg("Timeout waiting for KV event"))?
+            .ok_or_else(|| Error::msg("KV events stream ended unexpectedly"))?;
+
+        match serde_json::from_slice::<RouterEvent>(&msg.payload) {
+            Ok(event) => {
+                println!("✓ Received KV event: {:?}", event);
+            }
+            Err(e) => {
+                return Err(Error::msg(format!("Failed to deserialize KV event: {}", e)));
             }
         }
 
-        // Give a small delay to ensure cleanup tasks complete
-        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+        // Use KvMetricsAggregator to get metrics more easily
+        let cancel_token = test_component.drt().runtime().child_token();
+        let metrics_aggregator = crate::kv_router::metrics_aggregator::KvMetricsAggregator::new(
+            test_component.clone(),
+            cancel_token,
+        )
+        .await;
+        tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
+
+        let processed_endpoints = metrics_aggregator.get_endpoints();
+        println!(
+            "Found {} metrics endpoints",
+            processed_endpoints.endpoints.len()
+        );
 
-        // Verify that active_requests is empty (all requests cleaned up)
-        let active_requests = engine.active_requests.lock().await;
+        // Verify we found at least one metrics endpoint
         assert!(
-            active_requests.is_empty(),
-            "Active requests should be empty after streams complete"
+            !processed_endpoints.endpoints.is_empty(),
+            "Should find at least one metrics endpoint"
+        );
+        println!(
+            "✓ Successfully found {} metrics endpoints",
+            processed_endpoints.endpoints.len()
         );
+
+        // Verify the metrics endpoints contain valid data
+        for (worker_id, endpoint) in &processed_endpoints.endpoints {
+            println!("✓ Worker {} metrics: {:?}", worker_id, endpoint.data);
+        }
+
+        println!("🎉 Event verification completed!");
+
+        // Cleanup
+        distributed.shutdown();
+        server_handle.await?;
+
+        Ok(())
     }
 }
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index a4ca6b830e..255e2380fb 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -196,7 +196,7 @@ pub struct Scheduler {
     dp_rank: Option<u32>,
     state: Arc<Mutex<SchedulerState>>,
     kv_manager: Arc<Mutex<KvManager>>,
-    request_tx: mpsc::Sender<DirectRequest>,
+    request_tx: mpsc::UnboundedSender<DirectRequest>,
     hit_rates: Arc<Mutex<VecDeque<f32>>>,
 }
 
@@ -205,8 +205,8 @@ impl Scheduler {
     pub fn new(
         args: MockEngineArgs,
         dp_rank: Option<u32>,
-        output_tx: Option<mpsc::Sender<OutputSignal>>,
-        kv_events_tx: Option<mpsc::Sender<KvCacheEventData>>,
+        output_tx: Option<mpsc::UnboundedSender<OutputSignal>>,
+        kv_events_tx: Option<mpsc::UnboundedSender<KvCacheEventData>>,
         cancellation_token: Option<CancellationToken>,
     ) -> Self {
         let state = Arc::new(Mutex::new(SchedulerState::default()));
@@ -234,7 +234,7 @@ impl Scheduler {
         );
 
         // Create channel for request handling
-        let (request_tx, mut request_rx) = mpsc::channel::<DirectRequest>(1024);
+        let (request_tx, mut request_rx) = mpsc::unbounded_channel::<DirectRequest>();
 
         // Create a clone for the background task
         let state_clone = state.clone();
@@ -337,7 +337,7 @@ impl Scheduler {
                             // Drain KV events and forward to relay after prefill signal processing
                             if let (Some(ref relay_tx), Some(ref mut rx)) = (&kv_events_tx, &mut block_resp_rx) {
                                 while let Ok(event) = rx.try_recv() {
-                                    let _ = relay_tx.try_send(block_response_to_kv_event(event));
+                                    let _ = relay_tx.send(block_response_to_kv_event(event));
                                 }
                             }
                         }
@@ -364,7 +364,7 @@ impl Scheduler {
                             // Drain KV events and forward to relay after decode signal processing
                             if let (Some(ref relay_tx), Some(ref mut rx)) = (&kv_events_tx, &mut block_resp_rx) {
                                 while let Ok(event) = rx.try_recv() {
-                                    let _ = relay_tx.try_send(block_response_to_kv_event(event));
+                                    let _ = relay_tx.send(block_response_to_kv_event(event));
                                 }
                             }
 
@@ -374,7 +374,7 @@ impl Scheduler {
                                     uuid,
                                     completed: false,
                                 };
-                                let _ = tx.try_send(signal);
+                                let _ = tx.send(signal);
                             }
 
                             // Check if we're done after generating
@@ -384,7 +384,7 @@ impl Scheduler {
                                         uuid,
                                         completed: true,
                                     };
-                                    let _ = tx.try_send(signal);
+                                    let _ = tx.send(signal);
                                 }
                                 state_guard.complete(&uuid);
                                 continue;
@@ -414,7 +414,7 @@ impl Scheduler {
 
     /// Add a new request to the waiting queue
     pub async fn receive(&self, request: DirectRequest) {
-        let _ = self.request_tx.send(request).await;
+        let _ = self.request_tx.send(request);
     }
 
     /// Get the count of waiting requests
@@ -552,7 +552,7 @@ mod tests {
         let max_output_tokens: usize = 100;
 
         // Create channel for token output
-        let (output_tx, mut output_rx) = mpsc::channel::<OutputSignal>(1024);
+        let (output_tx, mut output_rx) = mpsc::unbounded_channel::<OutputSignal>();
 
         // Create scheduler args using builder
         let args = MockEngineArgs::builder()
@@ -665,7 +665,7 @@ mod tests {
         let token_length = 65;
 
         // Create channel for token output
-        let (output_tx, mut output_rx) = mpsc::channel::<OutputSignal>(1024);
+        let (output_tx, mut output_rx) = mpsc::unbounded_channel::<OutputSignal>();
 
         // Create scheduler args
         let args = MockEngineArgs::builder()

From 4502e5e6a364b067af957965513420c7f2514f6f Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Sat, 14 Jun 2025 02:56:41 -0700
Subject: [PATCH 12/36] better tracing

---
 lib/llm/src/mocker/engine.rs | 82 ++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index ca6eb968de..34f137d661 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -172,27 +172,27 @@ impl MockVllmEngine {
         component: Option<Component>,
         cancel_token: CancellationToken,
     ) -> Result<()> {
-        println!("🔧 Creating metrics publisher...");
+        tracing::info!("Creating metrics publisher");
         let metrics_publisher = Arc::new(WorkerMetricsPublisher::new()?);
-        println!("✓ Metrics publisher created");
+        tracing::info!("Metrics publisher created");
 
         if let Some(comp) = component {
-            println!("🔧 Creating metrics endpoint...");
+            tracing::info!("Creating metrics endpoint");
             tokio::spawn({
                 let publisher = metrics_publisher.clone();
                 async move {
                     if let Err(e) = publisher.create_endpoint(comp.clone()).await {
-                        println!("Metrics endpoint failed: {}", e);
+                        tracing::error!("Metrics endpoint failed: {}", e);
                     }
                 }
             });
 
             // Give it a moment to start
             tokio::time::sleep(Duration::from_millis(100)).await;
-            println!("✓ Metrics endpoint started (background)");
+            tracing::info!("Metrics endpoint started (background)");
         }
 
-        println!("🔧 Starting metrics background tasks...");
+        tracing::info!("Starting metrics background tasks");
         for (dp_rank, scheduler) in schedulers.iter().enumerate() {
             let scheduler = scheduler.clone();
             let publisher = metrics_publisher.clone();
@@ -223,7 +223,7 @@ impl MockVllmEngine {
                 }
             });
         }
-        println!("✓ Metrics background tasks started");
+        tracing::info!("Metrics background tasks started");
         Ok(())
     }
 
@@ -234,45 +234,45 @@ impl MockVllmEngine {
         block_size: usize,
         cancel_token: CancellationToken,
     ) -> Result<()> {
-        println!("🔧 Starting KV events publishing...");
+        tracing::info!("Starting KV events publishing");
 
         // Only start KV events publishing if we have a component
         let Some(comp) = component else {
-            println!("⚠️ No component provided, skipping KV events publishing");
+            tracing::warn!("No component provided, skipping KV events publishing");
             return Ok(());
         };
-        println!("✓ Component found for KV events publishing");
+        tracing::info!("Component found for KV events publishing");
 
-        println!("🔧 Getting worker_id...");
+        tracing::debug!("Getting worker_id");
         let worker_id = comp
             .drt()
             .primary_lease()
             .expect("Cannot publish KV events without lease") // ← This will PANIC on static!
             .id();
         // let worker_id = 0;
-        println!("✓ Worker_id set to: {}", worker_id);
+        tracing::debug!("Worker_id set to: {}", worker_id);
 
-        println!("🔧 Creating KV event publisher...");
+        tracing::info!("Creating KV event publisher");
         let kv_event_publisher = Arc::new(KvEventPublisher::new(
             comp.clone(),
             worker_id,
             block_size,
             None,
         )?);
-        println!("✓ KV event publisher created");
+        tracing::info!("KV event publisher created");
 
-        println!(
-            "🔧 Starting KV event background tasks for {} receivers...",
+        tracing::info!(
+            "Starting KV event background tasks for {} receivers",
             kv_event_receivers.len()
         );
         for (dp_rank, mut kv_events_rx) in kv_event_receivers.into_iter().enumerate() {
-            println!("🔧 Starting background task for DP rank {}", dp_rank);
+            tracing::debug!("Starting background task for DP rank {}", dp_rank);
             let publisher = kv_event_publisher.clone();
             let dp_rank = dp_rank as u32;
             let cancel_token = cancel_token.clone();
 
             tokio::spawn(async move {
-                println!("✓ Background task started for DP rank {}", dp_rank);
+                tracing::debug!("Background task started for DP rank {}", dp_rank);
                 loop {
                     tokio::select! {
                         // Receive actual KV events from the scheduler
@@ -298,7 +298,7 @@ impl MockVllmEngine {
                 }
             });
         }
-        println!("✓ All KV event background tasks started");
+        tracing::info!("All KV event background tasks started");
 
         Ok(())
     }
@@ -403,7 +403,7 @@ mod integration_tests {
         let worker = Worker::from_settings()?;
         let runtime = worker.runtime();
         let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;
-        println!("✓ Runtime and distributed runtime created");
+        tracing::info!("✓ Runtime and distributed runtime created");
 
         // Create component for MockVllmEngine (needed for publishers)
         let test_component = distributed
@@ -412,7 +412,7 @@ mod integration_tests {
             .service_builder()
             .create()
             .await?;
-        println!("✓ Test component created");
+        tracing::info!("✓ Test component created");
 
         // Create MockVllmEngine WITH component (enables publishers)
         let args = MockEngineArgs::builder()
@@ -423,15 +423,15 @@ mod integration_tests {
             .unwrap();
 
         let engine = Arc::new(MockVllmEngine::new(args, Some(test_component.clone()), None).await?);
-        println!("✓ MockVllmEngine created with DP_SIZE: {}", DP_SIZE);
+        tracing::info!("✓ MockVllmEngine created with DP_SIZE: {}", DP_SIZE);
 
         // Set up KV events subscriber
         let mut kv_events_subscriber = test_component.subscribe(KV_EVENT_SUBJECT).await?;
-        println!("✓ KV events subscriber created");
+        tracing::info!("✓ KV events subscriber created");
 
         // Wrap with Ingress and register with component/endpoint
         let ingress = Ingress::for_engine(engine)?;
-        println!("✓ Ingress wrapper created");
+        tracing::info!("✓ Ingress wrapper created");
 
         // Start the server in background
         let server_handle = tokio::spawn({
@@ -448,18 +448,18 @@ mod integration_tests {
                 }
             }
         });
-        println!("✓ Server started in background");
+        tracing::info!("✓ Server started in background");
 
         // Give server time to start
         tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
-        println!("✓ Server startup delay completed");
+        tracing::info!("✓ Server startup delay completed");
 
         // Print all registered instances from etcd
         match test_component.list_instances().await {
             Ok(instances) => {
-                println!("📋 Found {} registered instances:", instances.len());
+                tracing::info!("📋 Found {} registered instances:", instances.len());
                 for instance in instances {
-                    println!(
+                    tracing::info!(
                         "  • {}/{}/{} (ID: {})",
                         instance.namespace,
                         instance.component,
@@ -469,7 +469,7 @@ mod integration_tests {
                 }
             }
             Err(e) => {
-                println!("❌ Failed to list instances: {}", e);
+                tracing::error!("❌ Failed to list instances: {}", e);
             }
         }
 
@@ -480,10 +480,10 @@ mod integration_tests {
             .endpoint("generate")
             .client()
             .await?;
-        println!("✓ Client created");
+        tracing::info!("✓ Client created");
 
         let router = PushRouter::from_client(client, Default::default()).await?;
-        println!("✓ Router created");
+        tracing::info!("✓ Router created");
 
         // Create test requests for both DP workers
         let create_request = |tokens: Vec<u32>, dp_rank: u32| DirectRequest {
@@ -499,14 +499,14 @@ mod integration_tests {
             create_request(vec![1, 2, 3, 4, 5], 1),
             create_request(vec![1, 2, 3, 4, 5], 1),
         ];
-        println!(
+        tracing::info!(
             "✓ Test requests created ({} requests total)",
             requests.len()
         );
 
         // Test each request
         for (i, request) in requests.into_iter().enumerate() {
-            println!("Testing request {}", i + 1);
+            tracing::info!("Testing request {}", i + 1);
 
             let response_stream = router.generate(Context::new(request)).await?;
             let responses: Vec<Annotated<String>> = response_stream.collect().await;
@@ -535,17 +535,17 @@ mod integration_tests {
                 }
             }
 
-            println!(
+            tracing::info!(
                 "✓ Request {} completed successfully with {} tokens",
                 i + 1,
                 responses.len()
             );
         }
 
-        println!("🎉 All requests completed successfully!");
+        tracing::info!("🎉 All requests completed successfully!");
 
         // Try to receive at least one KV event with 100ms timeout
-        println!("Waiting for KV event with 100ms timeout...");
+        tracing::info!("Waiting for KV event with 100ms timeout...");
         let msg = timeout(Duration::from_millis(100), kv_events_subscriber.next())
             .await
             .map_err(|_| Error::msg("Timeout waiting for KV event"))?
@@ -553,7 +553,7 @@ mod integration_tests {
 
         match serde_json::from_slice::<RouterEvent>(&msg.payload) {
             Ok(event) => {
-                println!("✓ Received KV event: {:?}", event);
+                tracing::info!("✓ Received KV event: {:?}", event);
             }
             Err(e) => {
                 return Err(Error::msg(format!("Failed to deserialize KV event: {}", e)));
@@ -570,7 +570,7 @@ mod integration_tests {
         tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
 
         let processed_endpoints = metrics_aggregator.get_endpoints();
-        println!(
+        tracing::info!(
             "Found {} metrics endpoints",
             processed_endpoints.endpoints.len()
         );
@@ -580,17 +580,17 @@ mod integration_tests {
             !processed_endpoints.endpoints.is_empty(),
             "Should find at least one metrics endpoint"
         );
-        println!(
+        tracing::info!(
             "✓ Successfully found {} metrics endpoints",
             processed_endpoints.endpoints.len()
         );
 
         // Verify the metrics endpoints contain valid data
         for (worker_id, endpoint) in &processed_endpoints.endpoints {
-            println!("✓ Worker {} metrics: {:?}", worker_id, endpoint.data);
+            tracing::info!("✓ Worker {} metrics: {:?}", worker_id, endpoint.data);
         }
 
-        println!("🎉 Event verification completed!");
+        tracing::info!("🎉 Event verification completed!");
 
         // Cleanup
         distributed.shutdown();

From fe20aa301566b161551d8c453570ef30303d6c04 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Mon, 16 Jun 2025 16:47:07 -0700
Subject: [PATCH 13/36] async engine core

---
 lib/llm/src/mocker/engine.rs | 176 ++++++++++++++++++++++++-----------
 1 file changed, 120 insertions(+), 56 deletions(-)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index 34f137d661..c7405d95e0 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -19,15 +19,17 @@
 //! to provide streaming token generation with realistic timing simulation.
 
 use crate::kv_router::publisher::WorkerMetricsPublisher;
-use crate::mocker::protocols::{DirectRequest, MockEngineArgs, OutputSignal};
+use crate::mocker::protocols::DirectRequest;
+use crate::mocker::protocols::{MockEngineArgs, OutputSignal};
 use crate::mocker::scheduler::Scheduler;
+use crate::protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest};
+use crate::protocols::TokenIdType;
 use tokio_util::sync::CancellationToken;
 
 use dynamo_runtime::{
     component::Component,
     engine::AsyncEngineContextProvider,
     pipeline::{async_trait, AsyncEngine, Error, ManyOut, ResponseStream, SingleIn},
-    protocols::annotated::Annotated,
     traits::DistributedRuntimeProvider,
     Result,
 };
@@ -42,22 +44,16 @@ use tokio::time::{interval, Duration};
 use tokio_stream::wrappers::ReceiverStream;
 use uuid::Uuid;
 
-/// Generate a random printable character
-fn generate_random_char() -> String {
+/// Generate a random token ID in the range 1..50000 (upper bound exclusive)
+fn generate_random_token() -> TokenIdType {
     let mut rng = rand::rng();
-    let selection = match rng.random_range(0..4) {
-        0 => ('a'..='z').nth(rng.random_range(0..26)).unwrap(), // lowercase
-        1 => ('A'..='Z').nth(rng.random_range(0..26)).unwrap(), // uppercase
-        2 => ('0'..='9').nth(rng.random_range(0..10)).unwrap(), // digits
-        _ => [' ', '.', ',', '!', '?'][rng.random_range(0..5)], // punctuation/space
-    };
-    selection.to_string()
+    rng.random_range(1..50000)
 }
 
 /// AsyncEngine wrapper around the Scheduler that generates random character tokens
 pub struct MockVllmEngine {
     schedulers: Vec<Scheduler>,
-    active_requests: Arc<Mutex<HashMap<Uuid, mpsc::Sender<OutputSignal>>>>,
+    active_requests: Arc<Mutex<HashMap<Uuid, mpsc::UnboundedSender<OutputSignal>>>>,
     dp_size: u32,
     cancel_token: CancellationToken,
 }
@@ -69,9 +65,10 @@ impl MockVllmEngine {
         component: Option<Component>,
         cancel_token: Option<CancellationToken>,
     ) -> Result<Self> {
-        let active_requests = Arc::new(Mutex::new(
-            HashMap::<Uuid, mpsc::Sender<OutputSignal>>::new(),
-        ));
+        let active_requests = Arc::new(Mutex::new(HashMap::<
+            Uuid,
+            mpsc::UnboundedSender<OutputSignal>,
+        >::new()));
 
         let cancel_token = cancel_token.unwrap_or_default();
 
@@ -105,7 +102,7 @@ impl MockVllmEngine {
     /// Returns schedulers and their corresponding KV event receivers
     fn start_schedulers(
         args: MockEngineArgs,
-        active_requests: Arc<Mutex<HashMap<Uuid, mpsc::Sender<OutputSignal>>>>,
+        active_requests: Arc<Mutex<HashMap<Uuid, mpsc::UnboundedSender<OutputSignal>>>>,
         cancel_token: CancellationToken,
     ) -> (
         Vec<Scheduler>,
@@ -152,7 +149,7 @@ impl MockVllmEngine {
                             // Notify the specific request that a token was generated
                             let active = active_requests_clone.lock().await;
                             if let Some(request_tx) = active.get(&signal.uuid) {
-                                let _ = request_tx.send(signal).await;
+                                let _ = request_tx.send(signal);
                             }
                         }
                         _ = cancel_token_cloned.cancelled() => {
@@ -305,14 +302,27 @@ impl MockVllmEngine {
 }
 
 #[async_trait]
-impl AsyncEngine<SingleIn<DirectRequest>, ManyOut<Annotated<String>>, Error> for MockVllmEngine {
+impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
+    for MockVllmEngine
+{
     async fn generate(
         &self,
-        input: SingleIn<DirectRequest>,
-    ) -> Result<ManyOut<Annotated<String>>, Error> {
-        let (mut request, ctx) = input.into_parts();
-
-        let dp_rank = request.dp_rank.unwrap_or(0);
+        input: SingleIn<PreprocessedRequest>,
+    ) -> Result<ManyOut<LLMEngineOutput>, Error> {
+        let (request, ctx) = input.into_parts();
+
+        // Extract dp_rank from annotations if present
+        let dp_rank = request
+            .annotations
+            .iter()
+            .find_map(|ann| {
+                if ann.starts_with("dp_rank:") {
+                    ann.strip_prefix("dp_rank:").and_then(|s| s.parse().ok())
+                } else {
+                    None
+                }
+            })
+            .unwrap_or(0);
 
         // Validate dp_rank
         if dp_rank >= self.dp_size {
@@ -323,9 +333,20 @@ impl AsyncEngine<SingleIn<DirectRequest>, ManyOut<Annotated<String>>, Error> for
         }
 
         let request_uuid = ctx.id().parse().unwrap_or(Uuid::new_v4());
-        request.uuid = Some(request_uuid);
 
-        let (request_tx, mut request_rx) = mpsc::channel::<OutputSignal>(64);
+        // Convert PreprocessedRequest to DirectRequest for scheduler
+        let direct_request = DirectRequest {
+            tokens: request.token_ids.clone(),
+            max_output_tokens: request
+                .stop_conditions
+                .max_tokens
+                .expect("max_output_tokens must be specified for mocker")
+                as usize,
+            uuid: Some(request_uuid),
+            dp_rank: Some(dp_rank),
+        };
+
+        let (request_tx, mut request_rx) = mpsc::unbounded_channel::<OutputSignal>();
         {
             let mut active = self.active_requests.lock().await;
             active.insert(request_uuid, request_tx);
@@ -333,35 +354,61 @@ impl AsyncEngine<SingleIn<DirectRequest>, ManyOut<Annotated<String>>, Error> for
 
         // Send the request to the appropriate scheduler based on dp_rank
         self.schedulers[dp_rank as usize]
-            .receive(request.clone())
+            .receive(direct_request)
             .await;
 
         // Create a simple channel for the stream
-        let (stream_tx, stream_rx) = mpsc::channel::<Annotated<String>>(64);
+        let (stream_tx, stream_rx) = mpsc::channel::<LLMEngineOutput>(64);
 
         let active_requests = self.active_requests.clone();
         let async_context = ctx.context();
         let cancel_token = self.cancel_token.clone();
+        let max_tokens = request.stop_conditions.max_tokens.unwrap_or(100) as usize;
 
         // Spawn a task to handle the complex async logic
         tokio::spawn(async move {
+            let mut token_count = 0;
+
             loop {
                 tokio::select! {
                     Some(signal) = request_rx.recv() => {
-                        if signal.completed {
+                        if signal.completed || token_count >= max_tokens {
+                            // Send final output with finish reason
+                            let final_output = if token_count >= max_tokens {
+                                LLMEngineOutput::length()
+                            } else {
+                                LLMEngineOutput::stop()
+                            };
+
+                            let _ = stream_tx.send(final_output).await;
                             break;
                         }
-                        let output = generate_random_char();
-                        if stream_tx.send(Annotated::from_data(output)).await.is_err() {
+
+                        // Generate a new token
+                        let token_id = generate_random_token();
+                        token_count += 1;
+
+                        let output = LLMEngineOutput {
+                            token_ids: vec![token_id],
+                            tokens: None,  // Let backend handle detokenization
+                            text: None,
+                            cum_log_probs: None,
+                            log_probs: None,
+                            finish_reason: None,
+                        };
+
+                        if stream_tx.send(output).await.is_err() {
                             break;
                         }
                     }
 
                     _ = async_context.stopped() => {
+                        let _ = stream_tx.send(LLMEngineOutput::cancelled()).await;
                         break;
                     }
 
                     _ = cancel_token.cancelled() => {
+                        let _ = stream_tx.send(LLMEngineOutput::cancelled()).await;
                         break;
                     }
                 }
@@ -383,6 +430,7 @@ mod integration_tests {
     use super::*;
     use crate::kv_router::indexer::RouterEvent;
     use crate::kv_router::KV_EVENT_SUBJECT;
+    use crate::protocols::common::{SamplingOptions, StopConditions};
     use dynamo_runtime::{
         pipeline::Context,
         pipeline::{network::Ingress, PushRouter},
@@ -486,11 +534,17 @@ mod integration_tests {
         tracing::info!("✓ Router created");
 
         // Create test requests for both DP workers
-        let create_request = |tokens: Vec<u32>, dp_rank: u32| DirectRequest {
-            tokens,
-            max_output_tokens: TOKENS_PER_REQUEST,
-            uuid: None,
-            dp_rank: Some(dp_rank),
+        let create_request = |tokens: Vec<TokenIdType>, dp_rank: u32| PreprocessedRequest {
+            token_ids: tokens,
+            stop_conditions: StopConditions {
+                max_tokens: Some(TOKENS_PER_REQUEST as u32),
+                ..Default::default()
+            },
+            sampling_options: SamplingOptions::default(),
+            eos_token_ids: vec![],
+            mdc_sum: None,
+            annotations: vec![format!("dp_rank:{}", dp_rank)],
+            estimated_prefix_hit_num_blocks: None,
         };
 
         let requests = vec![
@@ -509,36 +563,46 @@ mod integration_tests {
             tracing::info!("Testing request {}", i + 1);
 
             let response_stream = router.generate(Context::new(request)).await?;
-            let responses: Vec<Annotated<String>> = response_stream.collect().await;
+            let responses: Vec<LLMEngineOutput> = response_stream.collect().await;
 
-            // Verify each stream produces exactly the expected number of tokens
-            assert_eq!(
-                responses.len(),
-                TOKENS_PER_REQUEST,
-                "Request {} should produce {} tokens, got {}",
-                i + 1,
-                TOKENS_PER_REQUEST,
-                responses.len()
+            // Should have at least one response
+            assert!(
+                !responses.is_empty(),
+                "Request {} should produce at least one response",
+                i + 1
             );
 
-            // Verify all tokens contain valid data
-            for (j, token) in responses.iter().enumerate() {
-                if let Some(char_data) = &token.data {
-                    assert!(
-                        !char_data.is_empty(),
-                        "Request {} token {} should not be empty",
-                        i + 1,
-                        j + 1
-                    );
-                } else {
-                    panic!("Request {} token {} should have data", i + 1, j + 1);
+            // Count total tokens generated (excluding final message)
+            let mut total_tokens = 0;
+            let mut has_finish_reason = false;
+
+            for response in &responses {
+                total_tokens += response.token_ids.len();
+                if response.finish_reason.is_some() {
+                    has_finish_reason = true;
                 }
             }
 
+            // Should have a finish reason in the last response
+            assert!(
+                has_finish_reason,
+                "Request {} should have a finish reason",
+                i + 1
+            );
+
+            // Verify we got approximately the expected number of tokens
+            assert!(
+                total_tokens <= TOKENS_PER_REQUEST + 1, // +1 for potential final empty response
+                "Request {} generated {} tokens, expected at most {}",
+                i + 1,
+                total_tokens,
+                TOKENS_PER_REQUEST + 1
+            );
+
             tracing::info!(
                 "✓ Request {} completed successfully with {} tokens",
                 i + 1,
-                responses.len()
+                total_tokens
             );
         }
 

From 2fbf998f8d1e1651735ae2d2b71d49a604125e24 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Mon, 16 Jun 2025 17:20:00 -0700
Subject: [PATCH 14/36] hook up with dynamo run

---
 launch/dynamo-run/src/flags.rs | 36 ++++++++++++++++++++++++----
 launch/dynamo-run/src/lib.rs   | 44 ++++++++++++++++++++++++++++++++++
 launch/dynamo-run/src/opt.rs   | 12 +++++++++-
 lib/llm/src/mocker/engine.rs   | 41 +++++++++++++++++++++++++++++--
 4 files changed, 126 insertions(+), 7 deletions(-)

diff --git a/launch/dynamo-run/src/flags.rs b/launch/dynamo-run/src/flags.rs
index 2ac7286302..961fb2f800 100644
--- a/launch/dynamo-run/src/flags.rs
+++ b/launch/dynamo-run/src/flags.rs
@@ -151,6 +151,18 @@ pub struct Flags {
     /// These are the command line arguments to the python engine when using `pystr` or `pytok`.
     #[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)]
     pub last: Vec<String>,
+
+    /// Mocker engine configuration from a JSON file.
+    /// Example file contents:
+    /// {
+    ///     "speedup_ratio": 1.0,
+    ///     "dp_size": 1,
+    ///     "num_gpu_blocks": 16384,
+    ///     "max_num_batched_tokens": 8192,
+    ///     "watermark": 0.01
+    /// }
+    #[arg(long)]
+    pub extra_mocker_args: Option<PathBuf>,
 }
 
 impl Flags {
@@ -216,12 +228,12 @@ impl Flags {
         out
     }
 
-    /// Load extra engine arguments from a JSON file
+    /// Load extra arguments from a JSON file
     /// Returns a HashMap of parameter names to values
-    pub fn load_extra_engine_args(
-        &self,
+    fn load_json_args(
+        path: &Option<PathBuf>,
     ) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
-        if let Some(path) = &self.extra_engine_args {
+        if let Some(path) = path {
             let file_content = std::fs::read_to_string(path)?;
             let args: HashMap<String, serde_json::Value> = serde_json::from_str(&file_content)?;
             Ok(Some(args))
@@ -229,6 +241,22 @@ impl Flags {
             Ok(None)
         }
     }
+
+    /// Load extra engine arguments from a JSON file
+    /// Returns a HashMap of parameter names to values
+    pub fn load_extra_engine_args(
+        &self,
+    ) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
+        Self::load_json_args(&self.extra_engine_args)
+    }
+
+    /// Load extra mocker arguments from a JSON file
+    /// Returns a HashMap of parameter names to values
+    pub fn load_extra_mocker_args(
+        &self,
+    ) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
+        Self::load_json_args(&self.extra_mocker_args)
+    }
 }
 
 #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)]
diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
index 82f0841a28..c2b972721e 100644
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -285,6 +285,50 @@ pub async fn run(
                 model: Box::new(local_model),
             }
         }
+
+        Output::Mocker => {
+            // Load mocker args from JSON file if provided
+            let mocker_args = flags.load_extra_mocker_args()?;
+
+            let mut builder = dynamo_llm::mocker::protocols::MockEngineArgs::builder();
+
+            // Use kv_cache_block_size flag as block_size if provided
+            if let Some(block_size) = flags.kv_cache_block_size {
+                builder = builder.block_size(block_size);
+            }
+
+            // Apply args from JSON file if provided
+            if let Some(args) = mocker_args {
+                if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) {
+                    builder = builder.speedup_ratio(v);
+                }
+                if let Some(v) = args.get("dp_size").and_then(|v| v.as_u64()) {
+                    builder = builder.dp_size(v as u32);
+                }
+                if let Some(v) = args.get("num_gpu_blocks").and_then(|v| v.as_u64()) {
+                    builder = builder.num_gpu_blocks(v as usize);
+                }
+                if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) {
+                    builder = builder.max_num_batched_tokens(Some(v as usize));
+                }
+                if let Some(v) = args.get("watermark").and_then(|v| v.as_f64()) {
+                    builder = builder.watermark(v);
+                }
+            }
+
+            let args = builder
+                .build()
+                .map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {}", e))?;
+
+            let engine = dynamo_llm::mocker::engine::make_mocker_engine(args)
+                .await
+                .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {}", e))?;
+
+            EngineConfig::StaticCore {
+                engine,
+                model: Box::new(local_model),
+            }
+        }
     };
 
     match in_opt {
diff --git a/launch/dynamo-run/src/opt.rs b/launch/dynamo-run/src/opt.rs
index 25ab953eb8..3e0d708206 100644
--- a/launch/dynamo-run/src/opt.rs
+++ b/launch/dynamo-run/src/opt.rs
@@ -90,6 +90,9 @@ pub enum Output {
     /// Listen for models on nats/etcd, add/remove dynamically
     Dynamic,
 
+    /// Mock vLLM engine for testing and development
+    Mocker,
+
     #[cfg(feature = "mistralrs")]
     /// Run inference on a model in a GGUF file using mistralrs w/ candle
     MistralRs,
@@ -126,6 +129,7 @@ impl TryFrom<&str> for Output {
 
             "echo_full" => Ok(Output::EchoFull),
             "echo_core" => Ok(Output::EchoCore),
+            "mocker" => Ok(Output::Mocker),
 
             "dyn" => Ok(Output::Dynamic),
 
@@ -160,6 +164,8 @@ impl fmt::Display for Output {
             Output::EchoCore => "echo_core",
 
             Output::Dynamic => "dyn",
+
+            Output::Mocker => "mocker",
         };
         write!(f, "{s}")
     }
@@ -168,7 +174,11 @@ impl fmt::Display for Output {
 impl Output {
     #[allow(unused_mut)]
     pub fn available_engines() -> Vec<String> {
-        let mut out = vec!["echo_core".to_string(), "echo_full".to_string()];
+        let mut out = vec![
+            "echo_core".to_string(),
+            "echo_full".to_string(),
+            "mocker".to_string(),
+        ];
         #[cfg(feature = "mistralrs")]
         {
             out.push(Output::MistralRs.to_string());
diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index c7405d95e0..ca8a1f6d26 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -19,6 +19,7 @@
 //! to provide streaming token generation with realistic timing simulation.
 
 use crate::kv_router::publisher::WorkerMetricsPublisher;
+use dynamo_runtime::protocols::annotated::Annotated;
 use crate::mocker::protocols::DirectRequest;
 use crate::mocker::protocols::{MockEngineArgs, OutputSignal};
 use crate::mocker::scheduler::Scheduler;
@@ -43,11 +44,12 @@ use tokio::sync::{mpsc, Mutex};
 use tokio::time::{interval, Duration};
 use tokio_stream::wrappers::ReceiverStream;
 use uuid::Uuid;
+use futures::StreamExt;
 
-/// Generate a random token ID from 0 to 50k
+/// Generate a random token ID from 0 to 5k
 fn generate_random_token() -> TokenIdType {
     let mut rng = rand::rng();
-    rng.random_range(1..50000)
+    rng.random_range(1..5000)
 }
 
 /// AsyncEngine wrapper around the Scheduler that generates random character tokens
@@ -425,6 +427,41 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
     }
 }
 
+pub struct AnnotatedMockEngine {
+    inner: Arc<MockVllmEngine>,
+}
+
+impl AnnotatedMockEngine {
+    pub fn new(inner: Arc<MockVllmEngine>) -> Self {
+        Self { inner }
+    }
+}
+
+#[async_trait]
+impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutput>>, Error>
+    for AnnotatedMockEngine
+{
+    async fn generate(
+        &self,
+        input: SingleIn<PreprocessedRequest>,
+    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
+        let stream = self.inner.generate(input).await?;
+        let context = stream.context();
+
+        // Convert stream of LLMEngineOutput to Annotated<LLMEngineOutput>
+        let annotated_stream = stream.map(Annotated::from_data);
+
+        Ok(ResponseStream::new(Box::pin(annotated_stream), context))
+    }
+}
+
+/// Create a mocker engine as ExecutionContext
+pub async fn make_mocker_engine(args: MockEngineArgs) -> Result<crate::backend::ExecutionContext, Error> {
+    let engine = MockVllmEngine::new(args, None, None).await?;
+    let annotated = AnnotatedMockEngine::new(Arc::new(engine));
+    Ok(Arc::new(annotated))
+}
+
 #[cfg(test)]
 mod integration_tests {
     use super::*;

From b5480503e2919ef7682cb45b57317b097499498c Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Mon, 16 Jun 2025 17:44:03 -0700
Subject: [PATCH 15/36] docs

---
 docs/guides/dynamo_run.md | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md
index 2f980f1158..702f628704 100644
--- a/docs/guides/dynamo_run.md
+++ b/docs/guides/dynamo_run.md
@@ -8,7 +8,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm.
 
 Usage:
 ```
-dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)]
+dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mocker|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--extra-mocker-args=args_mocker.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)]
 ```
 
 Example: `dynamo run Qwen/Qwen3-0.6B`
@@ -514,6 +514,39 @@ The output looks like this:
 {"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855}
 ```
 
+#### Mocker engine
+
+The mocker engine is a mock vLLM implementation designed for testing and development purposes. It simulates realistic token generation timing without requiring actual model inference, making it useful for:
+
+- Testing distributed system components without GPU resources
+- Benchmarking infrastructure and networking overhead
+- Developing and debugging Dynamo components
+- Load testing and performance analysis
+
+**Basic usage:**
+
+```bash
+dynamo-run in=http out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0
+```
+
+The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights.
+
+Available options:
+- `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster.
+- `dp_size`: Number of data parallel workers to simulate (default: 1)
+- `num_gpu_blocks`: Number of GPU blocks to simulate for the KV cache (default: 16384). This is normally calculated automatically by the real vllm engine based on the VRAM size and model KV cache size.
+- `max_num_batched_tokens`: Maximum number of tokens that can be batched together (default: 8192)
+- `watermark`: KV cache watermark threshold as a fraction (default: 0.01)
+
+**Example with custom settings:**
+```bash
+# Create configuration file
+echo '{"speedup_ratio": 10.0, "dp_size": 4}' > mocker_args.json
+
+# Run mocker with configuration
+dynamo-run in=http out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-mocker-args mocker_args.json
+```
+
 ### Extra engine arguments
 The vllm and sglang backends support passing any argument the engine accepts.
 Put the arguments in a JSON file:

From c7c4be5d57104d1d223bece851b503a65c000c1d Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Mon, 16 Jun 2025 17:51:20 -0700
Subject: [PATCH 16/36]  fmt

---
 lib/llm/src/mocker/engine.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index ca8a1f6d26..3015b7eb58 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -19,12 +19,12 @@
 //! to provide streaming token generation with realistic timing simulation.
 
 use crate::kv_router::publisher::WorkerMetricsPublisher;
-use dynamo_runtime::protocols::annotated::Annotated;
 use crate::mocker::protocols::DirectRequest;
 use crate::mocker::protocols::{MockEngineArgs, OutputSignal};
 use crate::mocker::scheduler::Scheduler;
 use crate::protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest};
 use crate::protocols::TokenIdType;
+use dynamo_runtime::protocols::annotated::Annotated;
 use tokio_util::sync::CancellationToken;
 
 use dynamo_runtime::{
@@ -37,6 +37,7 @@ use dynamo_runtime::{
 
 use crate::kv_router::protocols::{KvCacheEvent, KvCacheEventData};
 use crate::kv_router::publisher::KvEventPublisher;
+use futures::StreamExt;
 use rand::Rng;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -44,7 +45,6 @@ use tokio::sync::{mpsc, Mutex};
 use tokio::time::{interval, Duration};
 use tokio_stream::wrappers::ReceiverStream;
 use uuid::Uuid;
-use futures::StreamExt;
 
 /// Generate a random token ID from 0 to 5k
 fn generate_random_token() -> TokenIdType {
@@ -456,7 +456,9 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
 }
 
 /// Create a mocker engine as ExecutionContext
-pub async fn make_mocker_engine(args: MockEngineArgs) -> Result<crate::backend::ExecutionContext, Error> {
+pub async fn make_mocker_engine(
+    args: MockEngineArgs,
+) -> Result<crate::backend::ExecutionContext, Error> {
     let engine = MockVllmEngine::new(args, None, None).await?;
     let annotated = AnnotatedMockEngine::new(Arc::new(engine));
     Ok(Arc::new(annotated))

From 3ad77807e402afc1ecb953ca4da7fff0b42edf7d Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Mon, 16 Jun 2025 21:07:57 -0700
Subject: [PATCH 17/36] refactor

---
 lib/llm/src/mocker/engine.rs | 104 ++++++++++++++++++-----------------
 1 file changed, 55 insertions(+), 49 deletions(-)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index 3015b7eb58..7b11867535 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -46,6 +46,8 @@ use tokio::time::{interval, Duration};
 use tokio_stream::wrappers::ReceiverStream;
 use uuid::Uuid;
 
+pub const MOCKER_COMPONENT: &str = "mocker";
+
 /// Generate a random token ID from 0 to 5k
 fn generate_random_token() -> TokenIdType {
     let mut rng = rand::rng();
@@ -53,56 +55,56 @@ fn generate_random_token() -> TokenIdType {
 }
 
 /// AsyncEngine wrapper around the Scheduler that generates random character tokens
+#[derive(Clone)]
 pub struct MockVllmEngine {
-    schedulers: Vec<Scheduler>,
     active_requests: Arc<Mutex<HashMap<Uuid, mpsc::UnboundedSender<OutputSignal>>>>,
-    dp_size: u32,
-    cancel_token: CancellationToken,
+    schedulers: Option<Vec<Scheduler>>,
+    engine_args: MockEngineArgs,
 }
 
 impl MockVllmEngine {
     /// Create a new MockVllmEngine with the given parameters
-    pub async fn new(
-        args: MockEngineArgs,
-        component: Option<Component>,
-        cancel_token: Option<CancellationToken>,
-    ) -> Result<Self> {
+    pub fn new(args: MockEngineArgs) -> Self {
         let active_requests = Arc::new(Mutex::new(HashMap::<
             Uuid,
             mpsc::UnboundedSender<OutputSignal>,
         >::new()));
 
-        let cancel_token = cancel_token.unwrap_or_default();
+        Self {
+            active_requests,
+            schedulers: None,
+            engine_args: args,
+        }
+    }
 
-        // Create schedulers and get their KV event receivers
-        let (schedulers, kv_event_receivers) =
-            Self::start_schedulers(args.clone(), active_requests.clone(), cancel_token.clone());
+    pub async fn start(&mut self, component: Component) -> Result<()> {
+        let cancel_token = component.drt().runtime().child_token();
 
-        Self::start_metrics_publishing(&schedulers, component.clone(), cancel_token.clone())
+        let (schedulers, kv_event_receiver) = self.start_schedulers(
+            self.engine_args.clone(),
+            self.active_requests.clone(),
+            cancel_token.clone(),
+        );
+
+        Self::start_metrics_publishing(&schedulers, Some(component.clone()), cancel_token.clone())
             .await?;
 
         // Start KV events publishing with the actual receivers from schedulers
         Self::start_kv_events_publishing(
-            kv_event_receivers,
-            component.clone(),
-            args.block_size,
+            kv_event_receiver,
+            Some(component.clone()),
+            self.engine_args.block_size,
             cancel_token.clone(),
         )
         .await?;
 
-        let engine = Self {
-            schedulers,
-            active_requests,
-            dp_size: args.dp_size,
-            cancel_token,
-        };
-
-        Ok(engine)
+        Ok(())
     }
 
     /// Create schedulers and spawn their background tasks for distributing token notifications
     /// Returns schedulers and their corresponding KV event receivers
     fn start_schedulers(
+        &mut self,
         args: MockEngineArgs,
         active_requests: Arc<Mutex<HashMap<Uuid, mpsc::UnboundedSender<OutputSignal>>>>,
         cancel_token: CancellationToken,
@@ -110,13 +112,13 @@ impl MockVllmEngine {
         Vec<Scheduler>,
         Vec<mpsc::UnboundedReceiver<KvCacheEventData>>,
     ) {
-        let mut schedulers = Vec::new();
+        let mut schedulers = Vec::<Scheduler>::new();
         let mut kv_event_receivers = Vec::new();
 
         // Create multiple schedulers and their background tasks
         for dp_rank in 0..args.dp_size {
             // Create a shared output channel that this scheduler will use
-            let (output_tx, output_rx) = mpsc::unbounded_channel::<OutputSignal>();
+            let (output_tx, mut output_rx) = mpsc::unbounded_channel::<OutputSignal>();
 
             // Create a channel for KV events from this scheduler
             let (kv_events_tx, kv_events_rx) = mpsc::unbounded_channel::<KvCacheEventData>();
@@ -133,17 +135,14 @@ impl MockVllmEngine {
             kv_event_receivers.push(kv_events_rx);
 
             // Spawn a background task for this scheduler to distribute token notifications to active requests
-            let output_rx = Arc::new(Mutex::new(output_rx));
+            // let output_rx = Arc::new(Mutex::new(output_rx));
             let active_requests_clone = active_requests.clone();
             let cancel_token_cloned = cancel_token.clone();
 
             tokio::spawn(async move {
                 loop {
                     tokio::select! {
-                        signal_result = async {
-                            let mut rx = output_rx.lock().await;
-                            rx.recv().await
-                        } => {
+                        signal_result = output_rx.recv() => {
                             let Some(signal) = signal_result else {
                                 break; // Channel closed
                             };
@@ -162,6 +161,7 @@ impl MockVllmEngine {
             });
         }
 
+        self.schedulers = Some(schedulers.clone());
         (schedulers, kv_event_receivers)
     }
 
@@ -327,10 +327,10 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
             .unwrap_or(0);
 
         // Validate dp_rank
-        if dp_rank >= self.dp_size {
+        if dp_rank >= self.engine_args.dp_size {
             return Err(Error::msg(format!(
                 "dp_rank {} is out of bounds for dp_size {}",
-                dp_rank, self.dp_size
+                dp_rank, self.engine_args.dp_size
             )));
         }
 
@@ -355,7 +355,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
         }
 
         // Send the request to the appropriate scheduler based on dp_rank
-        self.schedulers[dp_rank as usize]
+        self.schedulers.as_ref().unwrap()[dp_rank as usize]
             .receive(direct_request)
             .await;
 
@@ -364,7 +364,6 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
 
         let active_requests = self.active_requests.clone();
         let async_context = ctx.context();
-        let cancel_token = self.cancel_token.clone();
         let max_tokens = request.stop_conditions.max_tokens.unwrap_or(100) as usize;
 
         // Spawn a task to handle the complex async logic
@@ -373,7 +372,11 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
 
             loop {
                 tokio::select! {
-                    Some(signal) = request_rx.recv() => {
+                    maybe_signal = request_rx.recv() => {
+                        let Some(signal) = maybe_signal else {
+                            break;
+                        };
+
                         if signal.completed || token_count >= max_tokens {
                             // Send final output with finish reason
                             let final_output = if token_count >= max_tokens {
@@ -408,11 +411,6 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
                         let _ = stream_tx.send(LLMEngineOutput::cancelled()).await;
                         break;
                     }
-
-                    _ = cancel_token.cancelled() => {
-                        let _ = stream_tx.send(LLMEngineOutput::cancelled()).await;
-                        break;
-                    }
                 }
             }
 
@@ -428,13 +426,18 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
 }
 
 pub struct AnnotatedMockEngine {
-    inner: Arc<MockVllmEngine>,
+    inner: MockVllmEngine,
 }
 
 impl AnnotatedMockEngine {
-    pub fn new(inner: Arc<MockVllmEngine>) -> Self {
+    pub fn new(inner: MockVllmEngine) -> Self {
         Self { inner }
     }
+
+    pub async fn start(&self, component: Component) -> Result<()> {
+        self.inner.clone().start(component).await?;
+        Ok(())
+    }
 }
 
 #[async_trait]
@@ -459,9 +462,9 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
 pub async fn make_mocker_engine(
     args: MockEngineArgs,
 ) -> Result<crate::backend::ExecutionContext, Error> {
-    let engine = MockVllmEngine::new(args, None, None).await?;
-    let annotated = AnnotatedMockEngine::new(Arc::new(engine));
-    Ok(Arc::new(annotated))
+    Ok(Arc::new(AnnotatedMockEngine::new(MockVllmEngine::new(
+        args,
+    ))))
 }
 
 #[cfg(test)]
@@ -495,7 +498,7 @@ mod integration_tests {
         // Create component for MockVllmEngine (needed for publishers)
         let test_component = distributed
             .namespace("test")?
-            .component("mock-vllm")?
+            .component(MOCKER_COMPONENT)?
             .service_builder()
             .create()
             .await?;
@@ -509,7 +512,10 @@ mod integration_tests {
             .build()
             .unwrap();
 
-        let engine = Arc::new(MockVllmEngine::new(args, Some(test_component.clone()), None).await?);
+        let mut engine = MockVllmEngine::new(args);
+        engine.start(test_component.clone()).await?;
+        tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
+        let engine = Arc::new(engine);
         tracing::info!("✓ MockVllmEngine created with DP_SIZE: {}", DP_SIZE);
 
         // Set up KV events subscriber
@@ -563,7 +569,7 @@ mod integration_tests {
         // Create client
         let client = distributed
             .namespace("test")?
-            .component("mock-vllm")?
+            .component(MOCKER_COMPONENT)?
             .endpoint("generate")
             .client()
             .await?;

From c78bef274c1532e753cb4f526a2ecebd96c7d138 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 17 Jun 2025 01:23:26 -0700
Subject: [PATCH 18/36] works with kv router

---
 docs/guides/dynamo_run.md       |  13 +---
 launch/dynamo-run/src/lib.rs    |  17 +++--
 lib/llm/src/mocker/engine.rs    | 122 ++++++++++++++++++++++----------
 lib/llm/src/mocker/scheduler.rs |   5 ++
 4 files changed, 107 insertions(+), 50 deletions(-)

diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md
index aec0b9d308..517e8381bb 100644
--- a/docs/guides/dynamo_run.md
+++ b/docs/guides/dynamo_run.md
@@ -525,10 +525,6 @@ The mocker engine is a mock vLLM implementation designed for testing and develop
 
 **Basic usage:**
 
-```bash
-dynamo-run in=http out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0
-```
-
 The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights.
 
 Available options:
@@ -538,13 +534,10 @@ Available options:
 - `max_num_batched_tokens`: Maximum number of tokens that can be batched together (default: 8192)
 - `watermark`: KV cache watermark threshold as a fraction (default: 0.01)
 
-**Example with custom settings:**
 ```bash
-# Create configuration file
-echo '{"speedup_ratio": 10.0, "dp_size": 4}' > mocker_args.json
-
-# Run mocker with configuration
-dynamo-run in=http out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-mocker-args mocker_args.json
+echo '{"speedup_ratio": 10.0}' > mocker_args.json
+dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0
+dynamo-run in=http out=dyn --router-mode kv
 ```
 
 ### Extra engine arguments
diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
index c2b972721e..11e70830c1 100644
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -58,6 +58,7 @@ pub async fn run(
         anyhow::bail!("Cannot use endpoint for both in and out");
     }
 
+    let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
     let cancel_token = runtime.primary_token();
     let maybe_path = flags
         .model_path_pos
@@ -287,6 +288,11 @@ pub async fn run(
         }
 
         Output::Mocker => {
+            let endpoint = match &in_opt {
+                Input::Endpoint(path) => path.parse()?,
+                _ => internal_endpoint("mocker"),
+            };
+
             // Load mocker args from JSON file if provided
             let mocker_args = flags.load_extra_mocker_args()?;
 
@@ -320,9 +326,13 @@ pub async fn run(
                 .build()
                 .map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {}", e))?;
 
-            let engine = dynamo_llm::mocker::engine::make_mocker_engine(args)
-                .await
-                .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {}", e))?;
+            let engine = dynamo_llm::mocker::engine::make_mocker_engine(
+                distributed_runtime.clone(),
+                endpoint,
+                args,
+            )
+            .await
+            .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {}", e))?;
 
             EngineConfig::StaticCore {
                 engine,
@@ -355,7 +365,6 @@ pub async fn run(
                 .await?;
         }
         Input::Endpoint(path) => {
-            let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
             crate::input::endpoint::run(distributed_runtime, path, engine_config).await?;
         }
     }
diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index 7b11867535..d4b8272007 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -25,6 +25,7 @@ use crate::mocker::scheduler::Scheduler;
 use crate::protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest};
 use crate::protocols::TokenIdType;
 use dynamo_runtime::protocols::annotated::Annotated;
+use dynamo_runtime::DistributedRuntime;
 use tokio_util::sync::CancellationToken;
 
 use dynamo_runtime::{
@@ -41,43 +42,38 @@ use futures::StreamExt;
 use rand::Rng;
 use std::collections::HashMap;
 use std::sync::Arc;
-use tokio::sync::{mpsc, Mutex};
+use tokio::sync::{mpsc, Mutex, OnceCell};
 use tokio::time::{interval, Duration};
 use tokio_stream::wrappers::ReceiverStream;
 use uuid::Uuid;
 
 pub const MOCKER_COMPONENT: &str = "mocker";
 
-/// Generate a random token ID from 0 to 5k
+/// Generate a random token ID from 1k to 5k
 fn generate_random_token() -> TokenIdType {
     let mut rng = rand::rng();
-    rng.random_range(1..5000)
+    rng.random_range(1000..5000)
 }
 
 /// AsyncEngine wrapper around the Scheduler that generates random character tokens
 #[derive(Clone)]
 pub struct MockVllmEngine {
     active_requests: Arc<Mutex<HashMap<Uuid, mpsc::UnboundedSender<OutputSignal>>>>,
-    schedulers: Option<Vec<Scheduler>>,
+    request_senders: Arc<OnceCell<Vec<mpsc::UnboundedSender<DirectRequest>>>>,
     engine_args: MockEngineArgs,
 }
 
 impl MockVllmEngine {
     /// Create a new MockVllmEngine with the given parameters
     pub fn new(args: MockEngineArgs) -> Self {
-        let active_requests = Arc::new(Mutex::new(HashMap::<
-            Uuid,
-            mpsc::UnboundedSender<OutputSignal>,
-        >::new()));
-
         Self {
-            active_requests,
-            schedulers: None,
+            active_requests: Arc::new(Mutex::new(HashMap::new())),
+            request_senders: Arc::new(OnceCell::new()),
             engine_args: args,
         }
     }
 
-    pub async fn start(&mut self, component: Component) -> Result<()> {
+    pub async fn start(&self, component: Component) -> Result<()> {
         let cancel_token = component.drt().runtime().child_token();
 
         let (schedulers, kv_event_receiver) = self.start_schedulers(
@@ -101,10 +97,15 @@ impl MockVllmEngine {
         Ok(())
     }
 
+    pub fn direct(&self, request: DirectRequest, dp_rank: usize) {
+        let senders = self.request_senders.get().expect("Not initialized");
+        let _ = senders[dp_rank].send(request);
+    }
+
     /// Create schedulers and spawn their background tasks for distributing token notifications
     /// Returns schedulers and their corresponding KV event receivers
     fn start_schedulers(
-        &mut self,
+        &self,
         args: MockEngineArgs,
         active_requests: Arc<Mutex<HashMap<Uuid, mpsc::UnboundedSender<OutputSignal>>>>,
         cancel_token: CancellationToken,
@@ -114,6 +115,7 @@ impl MockVllmEngine {
     ) {
         let mut schedulers = Vec::<Scheduler>::new();
         let mut kv_event_receivers = Vec::new();
+        let mut senders = Vec::with_capacity(args.dp_size as usize);
 
         // Create multiple schedulers and their background tasks
         for dp_rank in 0..args.dp_size {
@@ -131,6 +133,7 @@ impl MockVllmEngine {
                 Some(cancel_token.clone()),
             );
 
+            senders.push(scheduler.request_sender());
             schedulers.push(scheduler);
             kv_event_receivers.push(kv_events_rx);
 
@@ -161,7 +164,11 @@ impl MockVllmEngine {
             });
         }
 
-        self.schedulers = Some(schedulers.clone());
+        // Set the senders once
+        self.request_senders
+            .set(senders)
+            .expect("Already initialized");
+
         (schedulers, kv_event_receivers)
     }
 
@@ -355,9 +362,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
         }
 
         // Send the request to the appropriate scheduler based on dp_rank
-        self.schedulers.as_ref().unwrap()[dp_rank as usize]
-            .receive(direct_request)
-            .await;
+        self.direct(direct_request, dp_rank as usize);
 
         // Create a simple channel for the stream
         let (stream_tx, stream_rx) = mpsc::channel::<LLMEngineOutput>(64);
@@ -374,18 +379,17 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
                 tokio::select! {
                     maybe_signal = request_rx.recv() => {
                         let Some(signal) = maybe_signal else {
+                            let _ = stream_tx.send(LLMEngineOutput::error("All output transmitters closed".to_string())).await;
                             break;
                         };
 
-                        if signal.completed || token_count >= max_tokens {
-                            // Send final output with finish reason
-                            let final_output = if token_count >= max_tokens {
-                                LLMEngineOutput::length()
-                            } else {
-                                LLMEngineOutput::stop()
-                            };
+                        if signal.completed && token_count < max_tokens {
+                            let _ = stream_tx.send(LLMEngineOutput::error("Completion signal received before max tokens reached".to_string())).await;
+                            break;
+                        }
 
-                            let _ = stream_tx.send(final_output).await;
+                        if signal.completed {
+                            let _ = stream_tx.send(LLMEngineOutput::length()).await;
                             break;
                         }
 
@@ -426,17 +430,58 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
 }
 
 pub struct AnnotatedMockEngine {
-    inner: MockVllmEngine,
+    inner: Arc<MockVllmEngine>,
 }
 
 impl AnnotatedMockEngine {
-    pub fn new(inner: MockVllmEngine) -> Self {
-        Self { inner }
-    }
+    pub fn new(
+        inner: MockVllmEngine,
+        distributed_runtime: DistributedRuntime,
+        endpoint: dynamo_runtime::protocols::Endpoint,
+    ) -> Self {
+        let inner = Arc::new(inner);
+        let inner_clone = inner.clone();
+
+        // Start background task to wait for component service and start the engine
+        tokio::spawn(async move {
+            loop {
+                // Try to create component
+                let Ok(namespace) = distributed_runtime.namespace(&endpoint.namespace) else {
+                    tracing::debug!("Namespace not available yet, retrying...");
+                    tokio::time::sleep(Duration::from_millis(100)).await;
+                    continue;
+                };
+
+                let Ok(component) = namespace.component(&endpoint.component) else {
+                    tracing::debug!("Component not available yet, retrying...");
+                    tokio::time::sleep(Duration::from_millis(100)).await;
+                    continue;
+                };
+
+                // Check if service is available by trying to list instances
+                let Ok(instances) = component.list_instances().await else {
+                    tracing::debug!("Cannot list instances yet, retrying...");
+                    tokio::time::sleep(Duration::from_millis(100)).await;
+                    continue;
+                };
+
+                if instances.is_empty() {
+                    tracing::debug!("No instances available yet, retrying...");
+                    tokio::time::sleep(Duration::from_millis(100)).await;
+                    continue;
+                }
 
-    pub async fn start(&self, component: Component) -> Result<()> {
-        self.inner.clone().start(component).await?;
-        Ok(())
+                tracing::info!("Component service is now available, starting mocker engine");
+
+                // Start the engine with the component
+                if let Err(e) = inner_clone.start(component).await {
+                    tracing::error!("Failed to start mocker engine: {}", e);
+                }
+                break;
+            }
+        });
+
+        Self { inner }
     }
 }
 
@@ -460,11 +505,16 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
 
 /// Create a mocker engine as ExecutionContext
 pub async fn make_mocker_engine(
+    distributed_runtime: DistributedRuntime,
+    endpoint: dynamo_runtime::protocols::Endpoint,
     args: MockEngineArgs,
 ) -> Result<crate::backend::ExecutionContext, Error> {
-    Ok(Arc::new(AnnotatedMockEngine::new(MockVllmEngine::new(
-        args,
-    ))))
+    // Create the mocker engine
+    tracing::info!("Creating mocker engine (service will be started in background)");
+    let annotated_engine =
+        AnnotatedMockEngine::new(MockVllmEngine::new(args), distributed_runtime, endpoint);
+
+    Ok(Arc::new(annotated_engine))
 }
 
 #[cfg(test)]
@@ -512,7 +562,7 @@ mod integration_tests {
             .build()
             .unwrap();
 
-        let mut engine = MockVllmEngine::new(args);
+        let engine = MockVllmEngine::new(args);
         engine.start(test_component.clone()).await?;
         tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
         let engine = Arc::new(engine);
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 255e2380fb..42aabe3c7b 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -417,6 +417,11 @@ impl Scheduler {
         let _ = self.request_tx.send(request);
     }
 
+    /// Expose the sender
+    pub fn request_sender(&self) -> mpsc::UnboundedSender<DirectRequest> {
+        self.request_tx.clone()
+    }
+
     /// Get the count of waiting requests
     pub async fn waiting_count(&self) -> usize {
         let state = self.state.lock().await;

From a206569b23185b7365657cc201332671bdc8b832 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 17 Jun 2025 02:55:17 -0700
Subject: [PATCH 19/36] actually load extra mocker args in guide

---
 docs/guides/dynamo_run.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md
index 517e8381bb..be508f9544 100644
--- a/docs/guides/dynamo_run.md
+++ b/docs/guides/dynamo_run.md
@@ -536,7 +536,7 @@ Available options:
 
 ```bash
 echo '{"speedup_ratio": 10.0}' > mocker_args.json
-dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0
+dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-mocker-args mocker_args.json
 dynamo-run in=http out=dyn --router-mode kv
 ```
 

From d3730ffaf3dc533966f4f32d9ae51bbb94a00bff Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Mon, 23 Jun 2025 01:01:17 -0700
Subject: [PATCH 20/36] free blocks if failed to send (receiver dropped)

---
 lib/llm/src/mocker/scheduler.rs | 91 +++++++++++++++++++++++++++------
 1 file changed, 75 insertions(+), 16 deletions(-)

diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 42aabe3c7b..83c49f3de2 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -368,24 +368,19 @@ impl Scheduler {
                                 }
                             }
 
-                            // Send UUID notification for each generated token
-                            if let Some(tx) = &output_tx_clone {
-                                let signal = OutputSignal {
-                                    uuid,
-                                    completed: false,
-                                };
-                                let _ = tx.send(signal);
+                            // Check completion and send notification
+                            let is_complete = sequence.generated_tokens() >= sequence.max_output_tokens();
+                            let send_failed = output_tx_clone.as_ref().is_some_and(|tx| {
+                                tx.send(OutputSignal { uuid, completed: is_complete }).is_err()
+                            });
+
+                            if send_failed {
+                                for signal in &sequence.free_signal() {
+                                    kv_manager_guard.process(signal);
+                                }
                             }
 
-                            // Check if we're done after generating
-                            if sequence.generated_tokens() >= sequence.max_output_tokens() {
-                                if let Some(tx) = &output_tx_clone {
-                                    let signal = OutputSignal {
-                                        uuid,
-                                        completed: true,
-                                    };
-                                    let _ = tx.send(signal);
-                                }
+                            if send_failed || is_complete {
                                 state_guard.complete(&uuid);
                                 continue;
                             }
@@ -753,4 +748,68 @@ mod tests {
         );
         println!("Received {} tokens", received_tokens);
     }
+
+    #[tokio::test]
+    async fn test_receiver_drop_cleans_up_resources() {
+        let block_size: usize = 64;
+        let input_tokens = 256;
+        let max_output_tokens = 200; // More than we'll receive
+
+        // Create channel for token output
+        let (output_tx, mut output_rx) = mpsc::unbounded_channel::<OutputSignal>();
+
+        // Create scheduler args
+        let args = MockEngineArgs::builder()
+            .num_gpu_blocks(10) // Enough for 256 tokens (4 blocks)
+            .block_size(block_size)
+            .speedup_ratio(100.0) // Fast simulation
+            .build()
+            .unwrap();
+
+        // Create scheduler
+        let scheduler = Scheduler::new(args, None, Some(output_tx), None, None);
+
+        // Create request with 256 tokens
+        let tokens: Vec<u32> = (0..input_tokens).map(|i| i as u32).collect();
+        let request = DirectRequest {
+            tokens,
+            max_output_tokens,
+            uuid: None,
+            dp_rank: None,
+        };
+
+        scheduler.receive(request).await;
+
+        // Receive exactly 129 tokens
+        let mut received_count = 0;
+        while received_count < 129 {
+            if let Some(_signal) = output_rx.recv().await {
+                received_count += 1;
+            } else {
+                panic!("Channel closed before receiving 129 tokens");
+            }
+        }
+
+        // Drop the receiver immediately
+        drop(output_rx);
+
+        // Wait for 1 second to allow cleanup
+        tokio::time::sleep(Duration::from_secs(1)).await;
+
+        // Check forward pass metrics
+        let metrics = scheduler.get_forward_pass_metrics().await;
+
+        assert_eq!(
+            metrics.gpu_cache_usage_perc,
+            0.0,
+            "Expected GPU cache usage to be 0%, got {}%",
+            metrics.gpu_cache_usage_perc * 100.0
+        );
+
+        assert_eq!(
+            metrics.kv_active_blocks, 0,
+            "Expected 0 active blocks, got {}",
+            metrics.kv_active_blocks
+        );
+    }
 }

From 68d822a0b9bafb43879ad7675d54d8e221ef21d6 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Mon, 23 Jun 2025 08:48:31 -0700
Subject: [PATCH 21/36] do not regenerate tokens after pre-emption

---
 lib/llm/src/mocker/scheduler.rs | 16 ++++++++++------
 lib/llm/src/mocker/sequence.rs  | 10 +++++++++-
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 83c49f3de2..3d09e87d98 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -173,7 +173,6 @@ impl SchedulerState {
         self.prefill_costs.remove(&uuid);
         eprintln!("Request {} will be preempted", uuid);
 
-        // Extract the ActiveSequence from the Request enum
         // Reset the sequence and get the new sequence and signal
         // Insert the new sequence back into the requests map and add to waiting queue
         let Request::Active(mut active_sequence) = request else {
@@ -370,9 +369,14 @@ impl Scheduler {
 
                             // Check completion and send notification
                             let is_complete = sequence.generated_tokens() >= sequence.max_output_tokens();
-                            let send_failed = output_tx_clone.as_ref().is_some_and(|tx| {
-                                tx.send(OutputSignal { uuid, completed: is_complete }).is_err()
-                            });
+                            let should_output = sequence.generated_tokens() > sequence.already_generated_tokens();
+
+                            let mut send_failed = false;
+                            if should_output {
+                                send_failed = output_tx_clone.as_ref().is_some_and(|tx| {
+                                    tx.send(OutputSignal { uuid, completed: is_complete }).is_err()
+                                });
+                            }
 
                             if send_failed {
                                 for signal in &sequence.free_signal() {
@@ -649,8 +653,8 @@ mod tests {
 
         // Assert that we received the expected number of tokens
         assert!(
-            received_tokens > expected_tokens,
-            "Received {} tokens but expected more than {}",
+            received_tokens == expected_tokens,
+            "Received {} tokens but expected exactly {}",
             received_tokens,
             expected_tokens
         );
diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs
index 17ef65d2c3..c3e0ed497e 100644
--- a/lib/llm/src/mocker/sequence.rs
+++ b/lib/llm/src/mocker/sequence.rs
@@ -58,6 +58,9 @@ pub struct ActiveSequence {
     #[getter(copy)]
     generated_tokens: usize,
 
+    #[getter(copy)]
+    already_generated_tokens: usize,
+
     #[getter(copy)]
     num_input_tokens: usize,
 
@@ -81,6 +84,7 @@ impl ActiveSequence {
             block_size,
             max_output_tokens,
             generated_tokens: 0,
+            already_generated_tokens: 0,
             num_input_tokens,
             creation_signal,
         }
@@ -215,6 +219,7 @@ impl ActiveSequence {
         self.tokens.truncate(self.num_input_tokens).unwrap();
         self.unique_blocks =
             create_unique_blocks_from_sequence(&self.tokens, None, self.block_size);
+        self.already_generated_tokens = self.generated_tokens.max(self.already_generated_tokens);
         self.generated_tokens = 0;
         self.creation_signal = Some(MoveBlock::Use(self.unique_blocks.clone()));
 
@@ -358,7 +363,7 @@ mod tests {
             seq1.push(token);
         }
 
-        // Push token 47 and get the signal - this completes the block and triggers signals
+        // Push token 48 and get the signal - this completes the block and triggers signals
         let signal = seq1.push(48);
         let signal = signal.unwrap();
 
@@ -382,6 +387,9 @@ mod tests {
         // Reset seq1 and check that it equals the original clone
         let free_signals = seq1.reset_with_signal();
 
+        // 49 - 15 generated tokens
+        assert_eq!(seq1.already_generated_tokens, 34);
+
         // Verify the reset signals include proper cleanup events
         assert!(!free_signals.is_empty());
     }

From d69edcf5dac94f10873a1a43c2912d78ed2a9151 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Sun, 29 Jun 2025 20:29:43 -0700
Subject: [PATCH 22/36] evictor cleanup

---
 lib/llm/src/mocker/evictor.rs | 37 ++++++++++++-----------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/lib/llm/src/mocker/evictor.rs b/lib/llm/src/mocker/evictor.rs
index bd1f827ebe..d113adffa1 100644
--- a/lib/llm/src/mocker/evictor.rs
+++ b/lib/llm/src/mocker/evictor.rs
@@ -58,18 +58,14 @@ impl<T: Clone + Eq + Hash> Default for LRUEvictor<T> {
 }
 
 impl<T: Clone + Eq + Hash> LRUEvictor<T> {
-    /// Create a new LRUEvictor
     pub fn new(_cleanup_threshold: usize) -> Self {
-        // Keep the parameter for API compatibility, but ignore it
         Self::default()
     }
 
-    /// Get an iterator over the keys in the evictor
     pub fn keys(&self) -> std::collections::hash_map::Keys<'_, T, i64> {
         self.free_table.keys()
     }
 
-    /// Private helper method to update the data structures with object and counter
     fn _update(&mut self, object: T, counter: i64) {
         self.free_table.insert(object.clone(), counter);
         self.priority_queue.insert(PriorityItem {
@@ -78,7 +74,6 @@ impl<T: Clone + Eq + Hash> LRUEvictor<T> {
         });
     }
 
-    /// Insert or update an object in the evictor with positive counter
     pub fn insert(&mut self, object: T) {
         // Remove old entry if it exists
         if let Some(&old_counter) = self.free_table.get(&object) {
@@ -112,7 +107,6 @@ impl<T: Clone + Eq + Hash> LRUEvictor<T> {
         self._update(object, counter);
     }
 
-    /// Check if the evictor contains the given object
     pub fn contains(&self, object: &T) -> bool {
         self.free_table.contains_key(object)
     }
@@ -120,34 +114,29 @@ impl<T: Clone + Eq + Hash> LRUEvictor<T> {
     /// Evict an object based on LRU policy (lowest counter value)
     /// Returns the evicted object or None if no objects are available
     pub fn evict(&mut self) -> Option<T> {
-        if let Some(item) = self.priority_queue.pop_first() {
+        self.priority_queue.pop_first().map(|item| {
             self.free_table.remove(&item.item);
-            Some(item.item)
-        } else {
-            None
-        }
+            item.item
+        })
     }
 
-    /// Remove an object from the evictor
     pub fn remove(&mut self, object: &T) -> bool {
-        if let Some(&counter) = self.free_table.get(object) {
-            self.free_table.remove(object);
-            self.priority_queue.remove(&PriorityItem {
-                item: object.clone(),
-                counter,
-            });
-            true
-        } else {
-            false
-        }
+        let Some(&counter) = self.free_table.get(object) else {
+            return false;
+        };
+
+        self.free_table.remove(object);
+        self.priority_queue.remove(&PriorityItem {
+            item: object.clone(),
+            counter,
+        });
+        true
     }
 
-    /// Get the number of objects in the evictor
     pub fn len(&self) -> usize {
         self.free_table.len()
     }
 
-    /// Check if the evictor is empty
     pub fn is_empty(&self) -> bool {
         self.free_table.is_empty()
     }

From c08f9eaca7fccfa2356073d5febb4bac18d4ad97 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Sun, 29 Jun 2025 21:19:37 -0700
Subject: [PATCH 23/36] only need runtime in dynamic arms

---
 launch/dynamo-run/src/lib.rs     | 27 +++++++++++++++++++++++----
 lib/llm/src/mocker/engine.rs     | 32 ++++++++++++++++----------------
 lib/llm/src/mocker/evictor.rs    |  6 +++---
 lib/llm/src/mocker/kv_manager.rs | 31 +++++++++----------------------
 lib/llm/src/mocker/scheduler.rs  | 12 +++++-------
 5 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
index 11e70830c1..05a2d9c4a3 100644
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -9,6 +9,7 @@ use dynamo_llm::{backend::ExecutionContext, engines::StreamingEngine, local_mode
 use dynamo_runtime::protocols::Endpoint as EndpointId;
 use dynamo_runtime::slug::Slug;
 use dynamo_runtime::{CancellationToken, DistributedRuntime};
+use tokio::sync::OnceCell;
 
 mod flags;
 pub use flags::Flags;
@@ -58,13 +59,27 @@ pub async fn run(
         anyhow::bail!("Cannot use endpoint for both in and out");
     }
 
-    let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
     let cancel_token = runtime.primary_token();
     let maybe_path = flags
         .model_path_pos
         .clone()
         .or(flags.model_path_flag.clone());
 
+    // Create a OnceCell for lazy initialization of distributed runtime
+    let distributed_runtime_cell: OnceCell<DistributedRuntime> = OnceCell::new();
+    let runtime_clone = runtime.clone();
+
+    // Helper closure to get or initialize the distributed runtime
+    let get_distributed_runtime = || async {
+        distributed_runtime_cell
+            .get_or_init(|| async {
+                DistributedRuntime::from_settings(runtime_clone.clone())
+                    .await
+                    .expect("Failed to create distributed runtime")
+            })
+            .await
+    };
+
     let mut local_model: LocalModel = if is_out_dynamic(&out_opt) {
         // If output is dynamic we are ingress and don't have a local model, but making an
         // empty one cleans up the code.
@@ -324,15 +339,17 @@ pub async fn run(
 
             let args = builder
                 .build()
-                .map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {}", e))?;
+                .map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {e}"))?;
 
+            // Get or initialize the distributed runtime
+            let distributed_runtime = get_distributed_runtime().await;
             let engine = dynamo_llm::mocker::engine::make_mocker_engine(
                 distributed_runtime.clone(),
                 endpoint,
                 args,
             )
             .await
-            .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {}", e))?;
+            .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {e}"))?;
 
             EngineConfig::StaticCore {
                 engine,
@@ -365,7 +382,9 @@ pub async fn run(
                 .await?;
         }
         Input::Endpoint(path) => {
-            crate::input::endpoint::run(distributed_runtime, path, engine_config).await?;
+            // Get or initialize the distributed runtime
+            let distributed_runtime = get_distributed_runtime().await;
+            crate::input::endpoint::run(distributed_runtime.clone(), path, engine_config).await?;
         }
     }
 
diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index d4b8272007..bce5b47bb7 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -188,7 +188,7 @@ impl MockVllmEngine {
                 let publisher = metrics_publisher.clone();
                 async move {
                     if let Err(e) = publisher.create_endpoint(comp.clone()).await {
-                        tracing::error!("Metrics endpoint failed: {}", e);
+                        tracing::error!("Metrics endpoint failed: {e}");
                     }
                 }
             });
@@ -216,13 +216,13 @@ impl MockVllmEngine {
 
                             // Publish metrics
                             if let Err(e) = publisher.publish(Arc::new(metrics)) {
-                                tracing::warn!("Failed to publish metrics for DP rank {}: {}", dp_rank, e);
+                                tracing::warn!("Failed to publish metrics for DP rank {dp_rank}: {e}");
                             } else {
                                 tracing::trace!("Published metrics for DP rank {}", dp_rank);
                             }
                         }
                         _ = cancel_token.cancelled() => {
-                            tracing::info!("Metrics publishing cancelled for DP rank {}", dp_rank);
+                            tracing::info!("Metrics publishing cancelled for DP rank {dp_rank}");
                             break;
                         }
                     }
@@ -256,7 +256,7 @@ impl MockVllmEngine {
             .expect("Cannot publish KV events without lease") // ← This will PANIC on static!
             .id();
         // let worker_id = 0;
-        tracing::debug!("Worker_id set to: {}", worker_id);
+        tracing::debug!("Worker_id set to: {worker_id}");
 
         tracing::info!("Creating KV event publisher");
         let kv_event_publisher = Arc::new(KvEventPublisher::new(
@@ -272,13 +272,13 @@ impl MockVllmEngine {
             kv_event_receivers.len()
         );
         for (dp_rank, mut kv_events_rx) in kv_event_receivers.into_iter().enumerate() {
-            tracing::debug!("Starting background task for DP rank {}", dp_rank);
+            tracing::debug!("Starting background task for DP rank {dp_rank}");
             let publisher = kv_event_publisher.clone();
             let dp_rank = dp_rank as u32;
             let cancel_token = cancel_token.clone();
 
             tokio::spawn(async move {
-                tracing::debug!("Background task started for DP rank {}", dp_rank);
+                tracing::debug!("Background task started for DP rank {dp_rank}");
                 loop {
                     tokio::select! {
                         // Receive actual KV events from the scheduler
@@ -291,13 +291,13 @@ impl MockVllmEngine {
 
                             // Publish the event
                             if let Err(e) = publisher.publish(event) {
-                                tracing::warn!("Failed to publish KV event for DP rank {}: {}", dp_rank, e);
+                                tracing::warn!("Failed to publish KV event for DP rank {dp_rank}: {e}");
                             } else {
-                                tracing::trace!("Published KV event for DP rank {}", dp_rank);
+                                tracing::trace!("Published KV event for DP rank {dp_rank}");
                             }
                         }
                         _ = cancel_token.cancelled() => {
-                            tracing::info!("KV events publishing cancelled for DP rank {}", dp_rank);
+                            tracing::info!("KV events publishing cancelled for DP rank {dp_rank}");
                             break;
                         }
                     }
@@ -475,7 +475,7 @@ impl AnnotatedMockEngine {
 
                 // Start the engine with the component
                 if let Err(e) = inner_clone.start(component).await {
-                    tracing::error!("Failed to start mocker engine: {}", e);
+                    tracing::error!("Failed to start mocker engine: {e}");
                 }
                 break;
             }
@@ -566,7 +566,7 @@ mod integration_tests {
         engine.start(test_component.clone()).await?;
         tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
         let engine = Arc::new(engine);
-        tracing::info!("✓ MockVllmEngine created with DP_SIZE: {}", DP_SIZE);
+        tracing::info!("✓ MockVllmEngine created with DP_SIZE: {DP_SIZE}");
 
         // Set up KV events subscriber
         let mut kv_events_subscriber = test_component.subscribe(KV_EVENT_SUBJECT).await?;
@@ -587,7 +587,7 @@ mod integration_tests {
                     .start()
                     .await
                 {
-                    eprintln!("❌ Generate endpoint failed: {}", e);
+                    eprintln!("❌ Generate endpoint failed: {e}");
                 }
             }
         });
@@ -612,7 +612,7 @@ mod integration_tests {
                 }
             }
             Err(e) => {
-                tracing::error!("❌ Failed to list instances: {}", e);
+                tracing::error!("❌ Failed to list instances: {e}");
             }
         }
 
@@ -638,7 +638,7 @@ mod integration_tests {
             sampling_options: SamplingOptions::default(),
             eos_token_ids: vec![],
             mdc_sum: None,
-            annotations: vec![format!("dp_rank:{}", dp_rank)],
+            annotations: vec![format!("dp_rank:{dp_rank}")],
             estimated_prefix_hit_num_blocks: None,
         };
 
@@ -712,10 +712,10 @@ mod integration_tests {
 
         match serde_json::from_slice::<RouterEvent>(&msg.payload) {
             Ok(event) => {
-                tracing::info!("✓ Received KV event: {:?}", event);
+                tracing::info!("✓ Received KV event: {event:?}");
             }
             Err(e) => {
-                return Err(Error::msg(format!("Failed to deserialize KV event: {}", e)));
+                return Err(Error::msg(format!("Failed to deserialize KV event: {e}")));
             }
         }
 
diff --git a/lib/llm/src/mocker/evictor.rs b/lib/llm/src/mocker/evictor.rs
index d113adffa1..63d079180d 100644
--- a/lib/llm/src/mocker/evictor.rs
+++ b/lib/llm/src/mocker/evictor.rs
@@ -66,7 +66,7 @@ impl<T: Clone + Eq + Hash> LRUEvictor<T> {
         self.free_table.keys()
     }
 
-    fn _update(&mut self, object: T, counter: i64) {
+    fn update(&mut self, object: T, counter: i64) {
         self.free_table.insert(object.clone(), counter);
         self.priority_queue.insert(PriorityItem {
             item: object,
@@ -87,7 +87,7 @@ impl<T: Clone + Eq + Hash> LRUEvictor<T> {
         self.positive_counter += 1;
         let counter = self.positive_counter;
 
-        self._update(object, counter);
+        self.update(object, counter);
     }
 
     /// Push an object to the front with negative counter (highest priority for eviction)
@@ -104,7 +104,7 @@ impl<T: Clone + Eq + Hash> LRUEvictor<T> {
         self.negative_counter -= 1;
         let counter = self.negative_counter;
 
-        self._update(object, counter);
+        self.update(object, counter);
     }
 
     pub fn contains(&self, object: &T) -> bool {
diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs
index 9d061b8ed0..d28e577c44 100644
--- a/lib/llm/src/mocker/kv_manager.rs
+++ b/lib/llm/src/mocker/kv_manager.rs
@@ -220,8 +220,7 @@ impl KvManager {
                 let Some(ref_count) = self.active_blocks.remove(&uuid_block) else {
                     let in_all_blocks = self.all_blocks.contains(&uuid_block);
                     panic!(
-                        "Missing active block for promotion: {:?}. Block still exists: {}",
-                        uuid_block, in_all_blocks
+                        "Missing active block for promotion: {uuid_block:?}. Block still exists: {in_all_blocks}"
                     );
                 };
 
@@ -371,7 +370,7 @@ mod tests {
         ) {
             let response = rx
                 .try_recv()
-                .unwrap_or_else(|_| panic!("Expected {} response {}", expected_type, description));
+                .unwrap_or_else(|_| panic!("Expected {expected_type} response {description}"));
 
             match (&response, expected_type) {
                 (MoveBlockResponse::Store(blocks, _parent_hash), "Store") => {
@@ -384,8 +383,7 @@ mod tests {
                     );
                     assert_eq!(
                         *blocks, expected_blocks,
-                        "Store blocks don't match expected {}",
-                        description
+                        "Store blocks don't match expected {description}"
                     );
                 }
                 (MoveBlockResponse::Remove(blocks), "Remove") => {
@@ -398,14 +396,10 @@ mod tests {
                     );
                     assert_eq!(
                         *blocks, expected_blocks,
-                        "Remove blocks don't match expected {}",
-                        description
+                        "Remove blocks don't match expected {description}"
                     );
                 }
-                _ => panic!(
-                    "Expected {} response, got {:?} {}",
-                    expected_type, response, description
-                ),
+                _ => panic!("Expected {expected_type} response, got {response:?} {description}"),
             }
         }
 
@@ -414,11 +408,7 @@ mod tests {
             rx: &mut mpsc::UnboundedReceiver<MoveBlockResponse>,
             description: &str,
         ) {
-            assert!(
-                rx.try_recv().is_err(),
-                "Expected no response {}",
-                description
-            );
+            assert!(rx.try_recv().is_err(), "Expected no response {description}",);
         }
 
         // Helper function to check if active blocks contain expected blocks with expected ref counts
@@ -433,14 +423,12 @@ mod tests {
                 let block = UniqueBlock::FullBlock(id);
                 assert!(
                     manager.active_blocks().contains_key(&block),
-                    "Block {} not found in active blocks",
-                    id
+                    "Block {id} not found in active blocks",
                 );
                 assert_eq!(
                     manager.active_blocks().get(&block),
                     Some(&ref_count),
-                    "Block {} has wrong reference count",
-                    id
+                    "Block {id} has wrong reference count",
                 );
             }
         }
@@ -463,8 +451,7 @@ mod tests {
                 let block = UniqueBlock::FullBlock(id);
                 assert!(
                     inactive_blocks.iter().any(|&b| *b == block),
-                    "Block {} not found in inactive blocks",
-                    id
+                    "Block {id} not found in inactive blocks",
                 );
             }
         }
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 3d09e87d98..2d7f73fe35 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -171,7 +171,7 @@ impl SchedulerState {
             .remove(&uuid)
             .expect("Request does not exist.");
         self.prefill_costs.remove(&uuid);
-        eprintln!("Request {} will be preempted", uuid);
+        eprintln!("Request {uuid} will be preempted");
 
         // Reset the sequence and get the new sequence and signal
         // Insert the new sequence back into the requests map and add to waiting queue
@@ -623,7 +623,7 @@ mod tests {
                 // Manual debug ticker that prints forward pass metrics
                 _ = debug_interval.tick() => {
                     let _metrics = scheduler.get_forward_pass_metrics().await;
-                    println!("Forward Pass Metrics: {:#?}", _metrics);
+                    println!("Forward Pass Metrics: {_metrics:#?}");
                 }
 
                 Some(_) = output_rx.recv() => {
@@ -654,9 +654,7 @@ mod tests {
         // Assert that we received the expected number of tokens
         assert!(
             received_tokens == expected_tokens,
-            "Received {} tokens but expected exactly {}",
-            received_tokens,
-            expected_tokens
+            "Received {received_tokens} tokens but expected exactly {expected_tokens}"
         );
     }
 
@@ -715,7 +713,7 @@ mod tests {
                 // Manual debug ticker that prints forward pass metrics
                 _ = debug_interval.tick() => {
                     let _metrics = scheduler.get_forward_pass_metrics().await;
-                    println!("Forward Pass Metrics: {:#?}", _metrics);
+                    println!("Forward Pass Metrics: {_metrics:#?}");
                 }
 
                 Some(_signal) = output_rx.recv() => {
@@ -750,7 +748,7 @@ mod tests {
             "Test passed! Cache hit rate: {:.3}",
             metrics.gpu_prefix_cache_hit_rate
         );
-        println!("Received {} tokens", received_tokens);
+        println!("Received {received_tokens} tokens");
     }
 
     #[tokio::test]

From dee1413670782d07d0a864fadfe05e6c49b917c5 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Sun, 29 Jun 2025 21:38:41 -0700
Subject: [PATCH 24/36] no separate extra-mocker-args

---
 docs/guides/dynamo_run.md      | 12 +++++-------
 launch/dynamo-run/src/flags.rs | 20 --------------------
 launch/dynamo-run/src/lib.rs   | 20 +++++++++++++-------
 3 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md
index be508f9544..76e7a3e2b4 100644
--- a/docs/guides/dynamo_run.md
+++ b/docs/guides/dynamo_run.md
@@ -8,7 +8,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm.
 
 Usage:
 ```
-dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mocker|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--extra-mocker-args=args_mocker.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)]
+dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mocker|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)]
 ```
 
 Example: `dynamo run Qwen/Qwen3-0.6B`
@@ -525,18 +525,16 @@ The mocker engine is a mock vLLM implementation designed for testing and develop
 
 **Basic usage:**
 
-The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights.
+The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `num-gpu-blocks`, `max-num-batched-tokens`, and `block-size` are common arguments shared with the real VLLM engine.
 
-Available options:
+And below are arguments that are mocker-specific:
 - `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster.
 - `dp_size`: Number of data parallel workers to simulate (default: 1)
-- `num_gpu_blocks`: Number of GPU blocks to simulate for the KV cache (default: 16384). This is normally calculated automatically by the real vllm engine based on the VRAM size and model KV cache size.
-- `max_num_batched_tokens`: Maximum number of tokens that can be batched together (default: 8192)
-- `watermark`: KV cache watermark threshold as a fraction (default: 0.01)
+- `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists for the real VLLM engine but cannot be passed as an engine arg.
 
 ```bash
 echo '{"speedup_ratio": 10.0}' > mocker_args.json
-dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-mocker-args mocker_args.json
+dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json
 dynamo-run in=http out=dyn --router-mode kv
 ```
 
diff --git a/launch/dynamo-run/src/flags.rs b/launch/dynamo-run/src/flags.rs
index 961fb2f800..4b9f4d0a30 100644
--- a/launch/dynamo-run/src/flags.rs
+++ b/launch/dynamo-run/src/flags.rs
@@ -151,18 +151,6 @@ pub struct Flags {
     /// These are the command line arguments to the python engine when using `pystr` or `pytok`.
     #[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)]
     pub last: Vec<String>,
-
-    /// Mocker engine configuration from a JSON file.
-    /// Example file contents:
-    /// {
-    ///     "speedup_ratio": 1.0,
-    ///     "dp_size": 1,
-    ///     "num_gpu_blocks": 16384,
-    ///     "max_num_batched_tokens": 8192,
-    ///     "watermark": 0.01
-    /// }
-    #[arg(long)]
-    pub extra_mocker_args: Option<PathBuf>,
 }
 
 impl Flags {
@@ -249,14 +237,6 @@ impl Flags {
     ) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
         Self::load_json_args(&self.extra_engine_args)
     }
-
-    /// Load extra mocker arguments from a JSON file
-    /// Returns a HashMap of parameter names to values
-    pub fn load_extra_mocker_args(
-        &self,
-    ) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
-        Self::load_json_args(&self.extra_mocker_args)
-    }
 }
 
 #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)]
diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
index 05a2d9c4a3..d62d11201c 100644
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -309,7 +309,7 @@ pub async fn run(
             };
 
             // Load mocker args from JSON file if provided
-            let mocker_args = flags.load_extra_mocker_args()?;
+            let engine_args = flags.load_extra_engine_args()?;
 
             let mut builder = dynamo_llm::mocker::protocols::MockEngineArgs::builder();
 
@@ -319,12 +319,10 @@ pub async fn run(
             }
 
             // Apply args from JSON file if provided
-            if let Some(args) = mocker_args {
-                if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) {
-                    builder = builder.speedup_ratio(v);
-                }
-                if let Some(v) = args.get("dp_size").and_then(|v| v.as_u64()) {
-                    builder = builder.dp_size(v as u32);
+            if let Some(args) = engine_args {
+                // This overwrites the kv_cache_block_size passed in
+                if let Some(v) = args.get("block_size").and_then(|v| v.as_u64()) {
+                    builder = builder.block_size(v as usize);
                 }
                 if let Some(v) = args.get("num_gpu_blocks").and_then(|v| v.as_u64()) {
                     builder = builder.num_gpu_blocks(v as usize);
@@ -332,9 +330,17 @@ pub async fn run(
                 if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) {
                     builder = builder.max_num_batched_tokens(Some(v as usize));
                 }
+
+                // These are mocker-specific args
+                if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) {
+                    builder = builder.speedup_ratio(v);
+                }
                 if let Some(v) = args.get("watermark").and_then(|v| v.as_f64()) {
                     builder = builder.watermark(v);
                 }
+                if let Some(v) = args.get("dp_size").and_then(|v| v.as_u64()) {
+                    builder = builder.dp_size(v as u32);
+                }
             }
 
             let args = builder

From 99fd3f2a52c85586b34cb9e6203937bbf3343d64 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Sun, 29 Jun 2025 21:44:24 -0700
Subject: [PATCH 25/36] update to match batched tokens

---
 lib/llm/src/mocker/engine.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index bce5b47bb7..1fafc1322a 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -404,6 +404,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
                             cum_log_probs: None,
                             log_probs: None,
                             finish_reason: None,
+                            index: None,
                         };
 
                         if stream_tx.send(output).await.is_err() {
@@ -631,6 +632,7 @@ mod integration_tests {
         // Create test requests for both DP workers
         let create_request = |tokens: Vec<TokenIdType>, dp_rank: u32| PreprocessedRequest {
             token_ids: tokens,
+            batch_token_ids: None,
             stop_conditions: StopConditions {
                 max_tokens: Some(TOKENS_PER_REQUEST as u32),
                 ..Default::default()

From 85c7ccfb108539a955b007f87718de5c28efc8fd Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Sun, 29 Jun 2025 21:58:05 -0700
Subject: [PATCH 26/36] max-num-seqs

---
 docs/guides/dynamo_run.md       |  2 +-
 launch/dynamo-run/src/lib.rs    |  3 +++
 lib/llm/src/mocker/protocols.rs |  3 +++
 lib/llm/src/mocker/scheduler.rs | 15 ++++++++++++---
 4 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md
index 76e7a3e2b4..5a231e9f89 100644
--- a/docs/guides/dynamo_run.md
+++ b/docs/guides/dynamo_run.md
@@ -525,7 +525,7 @@ The mocker engine is a mock vLLM implementation designed for testing and develop
 
 **Basic usage:**
 
-The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `num-gpu-blocks`, `max-num-batched-tokens`, and `block-size` are common arguments shared with the real VLLM engine.
+The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `block-size` are common arguments shared with the real VLLM engine.
 
 And below are arguments that are mocker-specific:
 - `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster.
diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
index d62d11201c..4b620a4f56 100644
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -327,6 +327,9 @@ pub async fn run(
                 if let Some(v) = args.get("num_gpu_blocks").and_then(|v| v.as_u64()) {
                     builder = builder.num_gpu_blocks(v as usize);
                 }
+                if let Some(v) = args.get("max_num_seqs").and_then(|v| v.as_u64()) {
+                    builder = builder.max_num_seqs(Some(v as usize));
+                }
                 if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) {
                     builder = builder.max_num_batched_tokens(Some(v as usize));
                 }
diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs
index 7fd8895594..2b759eefe6 100644
--- a/lib/llm/src/mocker/protocols.rs
+++ b/lib/llm/src/mocker/protocols.rs
@@ -91,6 +91,9 @@ pub struct MockEngineArgs {
     #[builder(default = "64")]
     pub block_size: usize,
 
+    #[builder(default = None)]
+    pub max_num_seqs: Option<usize>,
+
     // default for open api server, for llm class it's 16384
     #[builder(default = Some(8192))]
     pub max_num_batched_tokens: Option<usize>,
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 2d7f73fe35..4457725bd1 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -138,6 +138,10 @@ impl SchedulerState {
         Some(sequence)
     }
 
+    fn num_active_requests(&self) -> usize {
+        self.prefill.len() + self.decode.len()
+    }
+
     /// Calculate the current running batched tokens
     fn num_batched_tokens(&self) -> usize {
         self.prefill_costs
@@ -272,23 +276,28 @@ impl Scheduler {
                         // schedule anymore.
                         let mut current_blocks = kv_manager_guard.num_active_blocks();
                         let mut current_tokens = state_guard.num_batched_tokens();
+                        let mut current_seqs = state_guard.num_active_requests();
+
                         while let Some((uuid, request)) = state_guard.next() {
                             let active_sequence = get_active_sequence(request, args.block_size);
 
                             // Update predictive budgets
                             let prefill_cost = kv_manager_guard.get_prefill_cost(&active_sequence);
-                            let new_tokens = active_sequence.len();
-                            let new_blocks = (new_tokens + 1) / args.block_size;  // this is conservative, assumes no cache hit
+                            let total_tokens = active_sequence.len();
+                            let new_blocks = (total_tokens + 1) / args.block_size;  // this is conservative, assumes no cache hit
                             let new_tokens = prefill_cost.new_tokens;
+
                             current_blocks += new_blocks;
                             current_tokens += new_tokens;
+                            current_seqs += 1;
 
                             // Check if it can be scheduled
                             let under_block_budget = current_blocks as f64 <= (1. - args.watermark) * kv_manager_guard.max_capacity() as f64;
                             let under_token_budget = args.max_num_batched_tokens.is_none_or(|limit| current_tokens <= limit);
+                            let under_seq_budget = args.max_num_seqs.is_none_or(|limit| current_seqs <= limit);
 
                             // Cannot schedule, put first in line instead
-                            if !(under_block_budget && under_token_budget) {
+                            if !(under_block_budget && under_token_budget && under_seq_budget) {
                                 state_guard.first_in_line(uuid, Request::Active(active_sequence));
                                 break;
                             }

From ec1f360851ce35eb005730966fa089dd955ff485 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Sun, 29 Jun 2025 22:22:51 -0700
Subject: [PATCH 27/36] enable_prefix_caching arg

---
 docs/guides/dynamo_run.md       |  2 +-
 launch/dynamo-run/src/lib.rs    |  3 +++
 lib/llm/src/mocker/protocols.rs |  6 ++++-
 lib/llm/src/mocker/scheduler.rs | 28 +++++++++++++------
 lib/llm/src/mocker/sequence.rs  | 48 +++++++++++++++++++++++++--------
 5 files changed, 66 insertions(+), 21 deletions(-)

diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md
index 5a231e9f89..952a77aa2d 100644
--- a/docs/guides/dynamo_run.md
+++ b/docs/guides/dynamo_run.md
@@ -525,7 +525,7 @@ The mocker engine is a mock vLLM implementation designed for testing and develop
 
 **Basic usage:**
 
-The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `block-size` are common arguments shared with the real VLLM engine.
+The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are common arguments shared with the real VLLM engine.
 
 And below are arguments that are mocker-specific:
 - `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster.
diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
index 4b620a4f56..4de3fc3358 100644
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -333,6 +333,9 @@ pub async fn run(
                 if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) {
                     builder = builder.max_num_batched_tokens(Some(v as usize));
                 }
+                if let Some(v) = args.get("enable_prefix_caching").and_then(|v| v.as_bool()) {
+                    builder = builder.enable_prefix_caching(v);
+                }
 
                 // These are mocker-specific args
                 if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) {
diff --git a/lib/llm/src/mocker/protocols.rs b/lib/llm/src/mocker/protocols.rs
index 2b759eefe6..880b97495c 100644
--- a/lib/llm/src/mocker/protocols.rs
+++ b/lib/llm/src/mocker/protocols.rs
@@ -91,13 +91,17 @@ pub struct MockEngineArgs {
     #[builder(default = "64")]
     pub block_size: usize,
 
-    #[builder(default = None)]
+    // This was 1024 in the past but reverted back to 256
+    #[builder(default = Some(256))]
     pub max_num_seqs: Option<usize>,
 
     // default for open api server, for llm class it's 16384
     #[builder(default = Some(8192))]
     pub max_num_batched_tokens: Option<usize>,
 
+    #[builder(default = true)]
+    pub enable_prefix_caching: bool,
+
     #[builder(default = "0.01")]
     pub watermark: f64,
 
diff --git a/lib/llm/src/mocker/scheduler.rs b/lib/llm/src/mocker/scheduler.rs
index 4457725bd1..0223b04d5e 100644
--- a/lib/llm/src/mocker/scheduler.rs
+++ b/lib/llm/src/mocker/scheduler.rs
@@ -279,7 +279,7 @@ impl Scheduler {
                         let mut current_seqs = state_guard.num_active_requests();
 
                         while let Some((uuid, request)) = state_guard.next() {
-                            let active_sequence = get_active_sequence(request, args.block_size);
+                            let active_sequence = get_active_sequence(request, args.block_size, args.enable_prefix_caching);
 
                             // Update predictive budgets
                             let prefill_cost = kv_manager_guard.get_prefill_cost(&active_sequence);
@@ -492,7 +492,11 @@ impl Scheduler {
 }
 
 /// Convert a Request to an ActiveSequence
-fn get_active_sequence(request: Request, block_size: usize) -> ActiveSequence {
+fn get_active_sequence(
+    request: Request,
+    block_size: usize,
+    enable_prefix_caching: bool,
+) -> ActiveSequence {
     if let Request::Active(active_seq) = request {
         return active_seq;
     }
@@ -505,6 +509,7 @@ fn get_active_sequence(request: Request, block_size: usize) -> ActiveSequence {
         direct_request.tokens,
         direct_request.max_output_tokens,
         Some(block_size),
+        enable_prefix_caching,
     )
 }
 
@@ -552,10 +557,15 @@ mod tests {
     use std::time::Duration;
 
     #[rstest]
-    #[case::random(false)]
-    #[case::caching(true)]
+    #[case::random_no_prefix_caching(false, false)]
+    #[case::random_with_prefix_caching(false, true)]
+    #[case::caching_no_prefix_caching(true, false)]
+    #[case::caching_with_prefix_caching(true, true)]
     #[tokio::test]
-    async fn test_scheduler_token_generation_patterns(#[case] use_shared_tokens: bool) {
+    async fn test_scheduler_token_generation_patterns(
+        #[case] use_shared_tokens: bool,
+        #[case] enable_prefix_caching: bool,
+    ) {
         std::env::set_var("RUST_LOG", "debug");
 
         let kv_capacity: usize = 500;
@@ -567,11 +577,12 @@ mod tests {
         // Create channel for token output
         let (output_tx, mut output_rx) = mpsc::unbounded_channel::<OutputSignal>();
 
-        // Create scheduler args using builder
+        // Create scheduler args using builder - now including enable_prefix_caching
         let args = MockEngineArgs::builder()
             .num_gpu_blocks(kv_capacity)
             .block_size(block_size)
             .speedup_ratio(10.0)
+            .enable_prefix_caching(enable_prefix_caching)
             .build()
             .unwrap();
 
@@ -651,13 +662,14 @@ mod tests {
         // Calculate and print elapsed time
         let elapsed = start_time.elapsed();
         println!(
-            "Test completed in: {:?} for {} case",
+            "Test completed in: {:?} for {} case with prefix_caching={}",
             elapsed,
             if use_shared_tokens {
                 "caching"
             } else {
                 "random"
-            }
+            },
+            enable_prefix_caching
         );
 
         // Assert that we received the expected number of tokens
diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs
index c3e0ed497e..67cc12475e 100644
--- a/lib/llm/src/mocker/sequence.rs
+++ b/lib/llm/src/mocker/sequence.rs
@@ -24,11 +24,18 @@ fn create_unique_blocks_from_sequence(
     tokens: &TokenBlockSequence,
     uuid: Option<uuid::Uuid>,
     block_size: usize,
+    enable_prefix_caching: bool,
 ) -> Vec<UniqueBlock> {
     let mut unique_blocks: Vec<UniqueBlock> = tokens
         .blocks()
         .iter()
-        .map(|block| UniqueBlock::FullBlock(block.sequence_hash()))
+        .map(|block| {
+            if enable_prefix_caching {
+                UniqueBlock::FullBlock(block.sequence_hash())
+            } else {
+                UniqueBlock::FullBlock(random::<u64>())
+            }
+        })
         .collect();
 
     // Only push the partial block if tokens count isn't a multiple of block_size
@@ -65,17 +72,26 @@ pub struct ActiveSequence {
     num_input_tokens: usize,
 
     creation_signal: Option<MoveBlock>,
+
+    #[getter(copy)]
+    enable_prefix_caching: bool,
 }
 
 impl ActiveSequence {
     /// Create a new ActiveSequence instance with the provided tokens
-    pub fn new(tokens: Vec<u32>, max_output_tokens: usize, block_size: Option<usize>) -> Self {
+    pub fn new(
+        tokens: Vec<u32>,
+        max_output_tokens: usize,
+        block_size: Option<usize>,
+        enable_prefix_caching: bool,
+    ) -> Self {
         let block_size = block_size.unwrap_or(64);
         assert!(block_size > 1, "block_size must be greater than 1");
         let num_input_tokens = tokens.len();
 
         let tokens = Tokens::from(tokens).into_sequence(block_size, None);
-        let unique_blocks = create_unique_blocks_from_sequence(&tokens, None, block_size);
+        let unique_blocks =
+            create_unique_blocks_from_sequence(&tokens, None, block_size, enable_prefix_caching);
         let creation_signal = Some(MoveBlock::Use(unique_blocks.clone()));
 
         Self {
@@ -87,6 +103,7 @@ impl ActiveSequence {
             already_generated_tokens: 0,
             num_input_tokens,
             creation_signal,
+            enable_prefix_caching,
         }
     }
 
@@ -107,8 +124,9 @@ impl ActiveSequence {
         tokens: Vec<u32>,
         max_output_tokens: usize,
         block_size: Option<usize>,
+        enable_prefix_caching: bool,
     ) -> (Self, Option<MoveBlock>) {
-        let mut sequence = Self::new(tokens, max_output_tokens, block_size);
+        let mut sequence = Self::new(tokens, max_output_tokens, block_size, enable_prefix_caching);
         let signal = sequence.creation_signal.take();
         (sequence, signal)
     }
@@ -139,7 +157,11 @@ impl ActiveSequence {
 
         // Replace last partial block with full block if it exists
         if let Some(UniqueBlock::PartialBlock(uuid)) = self.unique_blocks.last().cloned() {
-            let last_block_hash = self.tokens.last_complete_block().unwrap().sequence_hash();
+            let last_block_hash = if self.enable_prefix_caching {
+                self.tokens.last_complete_block().unwrap().sequence_hash()
+            } else {
+                random::<u64>()
+            };
             self.unique_blocks.pop();
             self.unique_blocks
                 .push(UniqueBlock::FullBlock(last_block_hash));
@@ -212,13 +234,16 @@ impl ActiveSequence {
     }
 
     /// Reset the sequence to its initial state and return the free signals from freeing current blocks
-    /// maintaining the uuid of the last partial block
     pub fn reset_with_signal(&mut self) -> Vec<MoveBlock> {
         let free_signal = self.free_signal();
 
         self.tokens.truncate(self.num_input_tokens).unwrap();
-        self.unique_blocks =
-            create_unique_blocks_from_sequence(&self.tokens, None, self.block_size);
+        self.unique_blocks = create_unique_blocks_from_sequence(
+            &self.tokens,
+            None,
+            self.block_size,
+            self.enable_prefix_caching,
+        );
         self.already_generated_tokens = self.generated_tokens.max(self.already_generated_tokens);
         self.generated_tokens = 0;
         self.creation_signal = Some(MoveBlock::Use(self.unique_blocks.clone()));
@@ -246,7 +271,8 @@ mod tests {
     fn test_active_sequence_push() {
         // Create a sequence with block size 16 initialized with tokens [0..15]
         let initial_tokens: Vec<u32> = (0..15).collect();
-        let (mut seq1, signal1) = ActiveSequence::new_with_signal(initial_tokens, 100, Some(16));
+        let (mut seq1, signal1) =
+            ActiveSequence::new_with_signal(initial_tokens, 100, Some(16), true);
         assert_eq!(seq1.num_input_tokens(), 15);
         assert_eq!(seq1.len(), 15);
 
@@ -296,7 +322,7 @@ mod tests {
 
         // Create another sequence with block size 16 initialized with tokens [0..17]
         let extended_tokens: Vec<u32> = (0..16).collect();
-        let (mut seq2, _) = ActiveSequence::new_with_signal(extended_tokens, 100, Some(16));
+        let (mut seq2, _) = ActiveSequence::new_with_signal(extended_tokens, 100, Some(16), true);
         seq2.push(16);
         seq2.pop();
         seq2.push(16);
@@ -398,7 +424,7 @@ mod tests {
     fn test_active_sequence_generate_signals() {
         // Create a sequence with block size 16, max_output_tokens 4, initialized with tokens [0..14)
         let initial_tokens: Vec<u32> = (0..14).collect();
-        let (mut seq, signal) = ActiveSequence::new_with_signal(initial_tokens, 5, Some(16));
+        let (mut seq, signal) = ActiveSequence::new_with_signal(initial_tokens, 5, Some(16), true);
 
         // Initial signal - should have received a Use signal for the partial block
         assert!(signal.is_some());

From 94abc0dea8cc157f15c0a757ddac93681726901f Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Sun, 29 Jun 2025 22:25:13 -0700
Subject: [PATCH 28/36] only publish kv events if enable_prefix_caching is set
 to true

---
 lib/llm/src/mocker/engine.rs | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index 1fafc1322a..068625f32e 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -86,13 +86,15 @@ impl MockVllmEngine {
             .await?;
 
         // Start KV events publishing with the actual receivers from schedulers
-        Self::start_kv_events_publishing(
-            kv_event_receiver,
-            Some(component.clone()),
-            self.engine_args.block_size,
-            cancel_token.clone(),
-        )
-        .await?;
+        if self.engine_args.enable_prefix_caching {
+            Self::start_kv_events_publishing(
+                kv_event_receiver,
+                Some(component.clone()),
+                self.engine_args.block_size,
+                cancel_token.clone(),
+            )
+            .await?;
+        }
 
         Ok(())
     }

From 35da284b03d44ad9eff64bb7b3e35b68a095b0af Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Sun, 29 Jun 2025 22:39:05 -0700
Subject: [PATCH 29/36] small note on chunked prefill being false for now

---
 docs/guides/dynamo_run.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md
index 952a77aa2d..472099c568 100644
--- a/docs/guides/dynamo_run.md
+++ b/docs/guides/dynamo_run.md
@@ -532,6 +532,9 @@ And below are arguments that are mocker-specific:
 - `dp_size`: Number of data parallel workers to simulate (default: 1)
 - `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists for the real VLLM engine but cannot be passed as an engine arg.
 
+>[!NOTE]
+>Currently, `enable_chunked_prefill` is always assumed to be false, which mirrors the vllm v0 behavior. This is also the current behavior in `examples/llm`. This will be updated in the near future as we move to support vllm v1 (and deprecate support for vllm v0).
+
 ```bash
 echo '{"speedup_ratio": 10.0}' > mocker_args.json
 dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json

From c7c072d7c915b404b35564a884eb10f0f48a8eb5 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 1 Jul 2025 10:41:40 -0700
Subject: [PATCH 30/36] revert flags

---
 launch/dynamo-run/src/flags.rs | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/launch/dynamo-run/src/flags.rs b/launch/dynamo-run/src/flags.rs
index 4b9f4d0a30..2ac7286302 100644
--- a/launch/dynamo-run/src/flags.rs
+++ b/launch/dynamo-run/src/flags.rs
@@ -216,12 +216,12 @@ impl Flags {
         out
     }
 
-    /// Load extra arguments from a JSON file
+    /// Load extra engine arguments from a JSON file
     /// Returns a HashMap of parameter names to values
-    fn load_json_args(
-        path: &Option<PathBuf>,
+    pub fn load_extra_engine_args(
+        &self,
     ) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
-        if let Some(path) = path {
+        if let Some(path) = &self.extra_engine_args {
             let file_content = std::fs::read_to_string(path)?;
             let args: HashMap<String, serde_json::Value> = serde_json::from_str(&file_content)?;
             Ok(Some(args))
@@ -229,14 +229,6 @@ impl Flags {
             Ok(None)
         }
     }
-
-    /// Load extra engine arguments from a JSON file
-    /// Returns a HashMap of parameter names to values
-    pub fn load_extra_engine_args(
-        &self,
-    ) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
-        Self::load_json_args(&self.extra_engine_args)
-    }
 }
 
 #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)]

From de54247b85779b710fc2c7fd057ee8b507a4b845 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 1 Jul 2025 10:46:21 -0700
Subject: [PATCH 31/36] revert dynamo-run changes

---
 docs/guides/dynamo_run.md    | 29 +-----------
 launch/dynamo-run/src/lib.rs | 85 +-----------------------------------
 launch/dynamo-run/src/opt.rs | 12 +----
 3 files changed, 3 insertions(+), 123 deletions(-)

diff --git a/docs/guides/dynamo_run.md b/docs/guides/dynamo_run.md
index 472099c568..0ed572a6d6 100644
--- a/docs/guides/dynamo_run.md
+++ b/docs/guides/dynamo_run.md
@@ -8,7 +8,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm.
 
 Usage:
 ```
-dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mocker|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)]
+dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)]
 ```
 
 Example: `dynamo run Qwen/Qwen3-0.6B`
@@ -514,33 +514,6 @@ The output looks like this:
 {"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855}
 ```
 
-#### Mocker engine
-
-The mocker engine is a mock vLLM implementation designed for testing and development purposes. It simulates realistic token generation timing without requiring actual model inference, making it useful for:
-
-- Testing distributed system components without GPU resources
-- Benchmarking infrastructure and networking overhead
-- Developing and debugging Dynamo components
-- Load testing and performance analysis
-
-**Basic usage:**
-
-The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights. The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are common arguments shared with the real VLLM engine.
-
-And below are arguments that are mocker-specific:
-- `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster.
-- `dp_size`: Number of data parallel workers to simulate (default: 1)
-- `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists for the real VLLM engine but cannot be passed as an engine arg.
-
->[!NOTE]
->Currently, `enable_chunked_prefill` is always assumed to be false, which mirrors the vllm v0 behavior. This is also the current behavior in `examples/llm`. This will be updated in the near future as we move to support vllm v1 (and deprecate support for vllm v0).
-
-```bash
-echo '{"speedup_ratio": 10.0}' > mocker_args.json
-dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json
-dynamo-run in=http out=dyn --router-mode kv
-```
-
 ### Extra engine arguments
 The vllm and sglang backends support passing any argument the engine accepts.
 Put the arguments in a JSON file:
diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
index 4de3fc3358..d6ec1c9322 100644
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -9,7 +9,6 @@ use dynamo_llm::{backend::ExecutionContext, engines::StreamingEngine, local_mode
 use dynamo_runtime::protocols::Endpoint as EndpointId;
 use dynamo_runtime::slug::Slug;
 use dynamo_runtime::{CancellationToken, DistributedRuntime};
-use tokio::sync::OnceCell;
 
 mod flags;
 pub use flags::Flags;
@@ -65,21 +64,6 @@ pub async fn run(
         .clone()
         .or(flags.model_path_flag.clone());
 
-    // Create a OnceCell for lazy initialization of distributed runtime
-    let distributed_runtime_cell: OnceCell<DistributedRuntime> = OnceCell::new();
-    let runtime_clone = runtime.clone();
-
-    // Helper closure to get or initialize the distributed runtime
-    let get_distributed_runtime = || async {
-        distributed_runtime_cell
-            .get_or_init(|| async {
-                DistributedRuntime::from_settings(runtime_clone.clone())
-                    .await
-                    .expect("Failed to create distributed runtime")
-            })
-            .await
-    };
-
     let mut local_model: LocalModel = if is_out_dynamic(&out_opt) {
         // If output is dynamic we are ingress and don't have a local model, but making an
         // empty one cleans up the code.
@@ -301,73 +285,6 @@ pub async fn run(
                 model: Box::new(local_model),
             }
         }
-
-        Output::Mocker => {
-            let endpoint = match &in_opt {
-                Input::Endpoint(path) => path.parse()?,
-                _ => internal_endpoint("mocker"),
-            };
-
-            // Load mocker args from JSON file if provided
-            let engine_args = flags.load_extra_engine_args()?;
-
-            let mut builder = dynamo_llm::mocker::protocols::MockEngineArgs::builder();
-
-            // Use kv_cache_block_size flag as block_size if provided
-            if let Some(block_size) = flags.kv_cache_block_size {
-                builder = builder.block_size(block_size);
-            }
-
-            // Apply args from JSON file if provided
-            if let Some(args) = engine_args {
-                // This overwrites the kv_cache_block_size passed in
-                if let Some(v) = args.get("block_size").and_then(|v| v.as_u64()) {
-                    builder = builder.block_size(v as usize);
-                }
-                if let Some(v) = args.get("num_gpu_blocks").and_then(|v| v.as_u64()) {
-                    builder = builder.num_gpu_blocks(v as usize);
-                }
-                if let Some(v) = args.get("max_num_seqs").and_then(|v| v.as_u64()) {
-                    builder = builder.max_num_seqs(Some(v as usize));
-                }
-                if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) {
-                    builder = builder.max_num_batched_tokens(Some(v as usize));
-                }
-                if let Some(v) = args.get("enable_prefix_caching").and_then(|v| v.as_bool()) {
-                    builder = builder.enable_prefix_caching(v);
-                }
-
-                // These are mocker-specific args
-                if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) {
-                    builder = builder.speedup_ratio(v);
-                }
-                if let Some(v) = args.get("watermark").and_then(|v| v.as_f64()) {
-                    builder = builder.watermark(v);
-                }
-                if let Some(v) = args.get("dp_size").and_then(|v| v.as_u64()) {
-                    builder = builder.dp_size(v as u32);
-                }
-            }
-
-            let args = builder
-                .build()
-                .map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {e}"))?;
-
-            // Get or initialize the distributed runtime
-            let distributed_runtime = get_distributed_runtime().await;
-            let engine = dynamo_llm::mocker::engine::make_mocker_engine(
-                distributed_runtime.clone(),
-                endpoint,
-                args,
-            )
-            .await
-            .map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {e}"))?;
-
-            EngineConfig::StaticCore {
-                engine,
-                model: Box::new(local_model),
-            }
-        }
     };
 
     match in_opt {
@@ -395,7 +312,7 @@ pub async fn run(
         }
         Input::Endpoint(path) => {
             // Get or initialize the distributed runtime
-            let distributed_runtime = get_distributed_runtime().await;
+            let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
             crate::input::endpoint::run(distributed_runtime.clone(), path, engine_config).await?;
         }
     }
diff --git a/launch/dynamo-run/src/opt.rs b/launch/dynamo-run/src/opt.rs
index 3e0d708206..25ab953eb8 100644
--- a/launch/dynamo-run/src/opt.rs
+++ b/launch/dynamo-run/src/opt.rs
@@ -90,9 +90,6 @@ pub enum Output {
     /// Listen for models on nats/etcd, add/remove dynamically
     Dynamic,
 
-    /// Mock vLLM engine for testing and development
-    Mocker,
-
     #[cfg(feature = "mistralrs")]
     /// Run inference on a model in a GGUF file using mistralrs w/ candle
     MistralRs,
@@ -129,7 +126,6 @@ impl TryFrom<&str> for Output {
 
             "echo_full" => Ok(Output::EchoFull),
             "echo_core" => Ok(Output::EchoCore),
-            "mocker" => Ok(Output::Mocker),
 
             "dyn" => Ok(Output::Dynamic),
 
@@ -164,8 +160,6 @@ impl fmt::Display for Output {
             Output::EchoCore => "echo_core",
 
             Output::Dynamic => "dyn",
-
-            Output::Mocker => "mocker",
         };
         write!(f, "{s}")
     }
@@ -174,11 +168,7 @@ impl fmt::Display for Output {
 impl Output {
     #[allow(unused_mut)]
     pub fn available_engines() -> Vec<String> {
-        let mut out = vec![
-            "echo_core".to_string(),
-            "echo_full".to_string(),
-            "mocker".to_string(),
-        ];
+        let mut out = vec!["echo_core".to_string(), "echo_full".to_string()];
         #[cfg(feature = "mistralrs")]
         {
             out.push(Output::MistralRs.to_string());

From 81c12aab7605cd04c537d8a684413a783cdda6f2 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 1 Jul 2025 10:47:35 -0700
Subject: [PATCH 32/36] tiny reversion

---
 launch/dynamo-run/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
index d6ec1c9322..529b597d83 100644
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -313,7 +313,7 @@ pub async fn run(
         Input::Endpoint(path) => {
             // Get or initialize the distributed runtime
             let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
-            crate::input::endpoint::run(distributed_runtime.clone(), path, engine_config).await?;
+            crate::input::endpoint::run(distributed_runtime, path, engine_config).await?;
         }
     }
 

From b959df4637dbd3f228d9d605079242f6da375b4b Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 1 Jul 2025 10:48:15 -0700
Subject: [PATCH 33/36] another reversion

---
 launch/dynamo-run/src/lib.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/launch/dynamo-run/src/lib.rs b/launch/dynamo-run/src/lib.rs
index 529b597d83..82f0841a28 100644
--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -311,7 +311,6 @@ pub async fn run(
                 .await?;
         }
         Input::Endpoint(path) => {
-            // Get or initialize the distributed runtime
             let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
             crate::input::endpoint::run(distributed_runtime, path, engine_config).await?;
         }

From b15070a2e53f1a885a5b5e820f54a310cab7f0a5 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 1 Jul 2025 10:59:07 -0700
Subject: [PATCH 34/36] usize reversion

---
 lib/llm/src/mocker/engine.rs     | 2 +-
 lib/llm/src/mocker/kv_manager.rs | 2 +-
 lib/llm/src/mocker/sequence.rs   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/llm/src/mocker/engine.rs b/lib/llm/src/mocker/engine.rs
index 068625f32e..b367910792 100644
--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -264,7 +264,7 @@ impl MockVllmEngine {
         let kv_event_publisher = Arc::new(KvEventPublisher::new(
             comp.clone(),
             worker_id,
-            block_size,
+            block_size as u32,
             None,
         )?);
         tracing::info!("KV event publisher created");
diff --git a/lib/llm/src/mocker/kv_manager.rs b/lib/llm/src/mocker/kv_manager.rs
index c2e7f8bb20..d28e577c44 100644
--- a/lib/llm/src/mocker/kv_manager.rs
+++ b/lib/llm/src/mocker/kv_manager.rs
@@ -58,7 +58,7 @@ pub struct KvManager {
     max_capacity: usize,
 
     #[getter(copy)]
-    block_size: u32,
+    block_size: usize,
 
     active_blocks: HashMap<UniqueBlock, usize>,
 
diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs
index db8178ef1f..1c5c9fdacf 100644
--- a/lib/llm/src/mocker/sequence.rs
+++ b/lib/llm/src/mocker/sequence.rs
@@ -89,7 +89,7 @@ impl ActiveSequence {
         assert!(block_size > 1, "block_size must be greater than 1");
         let num_input_tokens = tokens.len();
 
-        let tokens = Tokens::from(tokens).into_sequence(block_size, None);
+        let tokens = Tokens::from(tokens).into_sequence(block_size as u32, None);
         let unique_blocks =
             create_unique_blocks_from_sequence(&tokens, None, block_size, enable_prefix_caching);
         let creation_signal = Some(MoveBlock::Use(unique_blocks.clone()));

From 3a20b9dd05f555ba8a4f45aba668794581c47567 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 1 Jul 2025 11:23:33 -0700
Subject: [PATCH 35/36] clippy

---
 lib/llm/src/mocker/sequence.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs
index 1c5c9fdacf..e0fb59a417 100644
--- a/lib/llm/src/mocker/sequence.rs
+++ b/lib/llm/src/mocker/sequence.rs
@@ -39,7 +39,7 @@ fn create_unique_blocks_from_sequence(
         .collect();
 
     // Only push the partial block if tokens count isn't a multiple of block_size
-    if tokens.total_tokens() % (block_size as usize) != 0 {
+    if tokens.total_tokens() % block_size != 0 {
         unique_blocks.push(match uuid {
             Some(uuid) => UniqueBlock::PartialBlock(uuid),
             None => UniqueBlock::default(),
@@ -108,7 +108,7 @@ impl ActiveSequence {
     }
 
     pub fn extra_tokens(&self) -> u32 {
-        (self.len() % self.block_size as usize) as u32
+        (self.len() % self.block_size) as u32
     }
 
     pub fn len(&self) -> usize {
@@ -147,7 +147,7 @@ impl ActiveSequence {
         self.tokens.append(token).expect("Token push failed.");
         self.generated_tokens += 1;
 
-        if self.len() % (self.block_size as usize) != 1 {
+        if self.len() % self.block_size != 1 {
             return None;
         }
 
@@ -257,7 +257,7 @@ impl ActiveSequence {
         self.generated_tokens = self.generated_tokens.saturating_sub(1);
 
         // Reverts to the last full block
-        if self.tokens.total_tokens() % (self.block_size as usize) == 0 {
+        if self.tokens.total_tokens() % self.block_size == 0 {
             self.unique_blocks.pop();
         }
     }

From c74760613b12927315f2b2f1e1c2a8420d8f64d2 Mon Sep 17 00:00:00 2001
From: PeaBrane <yanrpei@gmail.com>
Date: Tue, 1 Jul 2025 12:00:12 -0700
Subject: [PATCH 36/36] more clippy

---
 lib/llm/src/mocker/sequence.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/llm/src/mocker/sequence.rs b/lib/llm/src/mocker/sequence.rs
index e0fb59a417..2145d8e561 100644
--- a/lib/llm/src/mocker/sequence.rs
+++ b/lib/llm/src/mocker/sequence.rs
@@ -318,7 +318,7 @@ mod tests {
         // Verify state after pushing tokens
         assert_eq!(seq1.unique_blocks().len(), 2); // One full block and one partial block
         assert_eq!(seq1.len(), 17);
-        assert_eq!(seq1.len() % (seq1.block_size() as usize), 1);
+        assert_eq!(seq1.len() % seq1.block_size(), 1);
 
         // Create another sequence with block size 16 initialized with tokens [0..17]
         let extended_tokens: Vec<u32> = (0..16).collect();
@@ -367,12 +367,12 @@ mod tests {
             "seq2 should have exactly 3 blocks"
         );
         assert_eq!(
-            seq1.len() % (seq1.block_size() as usize),
+            seq1.len() % seq1.block_size(),
             1,
             "seq1 should have 1 partial token"
         );
         assert_eq!(
-            seq2.len() % (seq2.block_size() as usize),
+            seq2.len() % seq2.block_size(),
             1,
             "seq2 should have 1 partial token"
         );