Merged

Commits (28)
4aba7a0  Update GPT OSS parser and related components (zhongdaor-nv, Sep 10, 2025)
de1a915  tmp (zhongdaor-nv, Sep 10, 2025)
da62e0c  feat(parsers): enhance harmony tool calling parser and add debug stat… (zhongdaor-nv, Sep 10, 2025)
471f1cb  tmp (zhongdaor-nv, Sep 10, 2025)
63d639a  remove code for debugging (zhongdaor-nv, Sep 10, 2025)
244d31c  resolve coderabit comment (zhongdaor-nv, Sep 10, 2025)
f50ed91  cargo fmt (zhongdaor-nv, Sep 10, 2025)
855fb29  coderabbit (zhongdaor-nv, Sep 10, 2025)
97f8f65  fix unit test (zhongdaor-nv, Sep 10, 2025)
bd56609  Merge branch 'main' into zhongdaor/gpt-oss-frontend (zhongdaor-nv, Sep 10, 2025)
33e7286  Merge branch 'main' into zhongdaor/gpt-oss-frontend (zhongdaor-nv, Sep 11, 2025)
c9485d7  Merge branch 'main' into zhongdaor/gpt-oss-frontend (zhongdaor-nv, Sep 15, 2025)
9a8ce48  Merge branch 'main' into zhongdaor/gpt-oss-frontend (zhongdaor-nv, Sep 15, 2025)
fbc5155  resolve comment (zhongdaor-nv, Sep 15, 2025)
031c965  Resolve merge conflicts in preprocessor.rs (zhongdaor-nv, Sep 15, 2025)
40c1d03  make ci/cd happy (zhongdaor-nv, Sep 16, 2025)
80b2ac8  Merge remote-tracking branch 'origin/main' into zhongdaor/gpt-oss-fro… (zhongdaor-nv, Sep 18, 2025)
9392353  merge to main (zhongdaor-nv, Sep 18, 2025)
489019a  Merge branch 'main' into zhongdaor/gpt-oss-frontend (zhongdaor-nv, Sep 18, 2025)
a733a8b  cargo fmt (zhongdaor-nv, Sep 18, 2025)
cdb9e7f  cargo test (zhongdaor-nv, Sep 18, 2025)
c0e22d7  Merge branch 'main' into zhongdaor/gpt-oss-frontend (zhongdaor-nv, Sep 18, 2025)
8c5d62b  add test for test_parse_tool_calls_harmony_complete_basic (zhongdaor-nv, Sep 18, 2025)
67995d7  Merge branch 'main' into zhongdaor/gpt-oss-frontend (zhongdaor-nv, Sep 18, 2025)
eb7855f  add more comment (zhongdaor-nv, Sep 18, 2025)
ca70608  Merge branch 'main' into zhongdaor/gpt-oss-frontend (zhongdaor-nv, Sep 18, 2025)
e579b3a  Merge branch 'main' into zhongdaor/gpt-oss-frontend (zhongdaor-nv, Sep 18, 2025)
fdc9f0d  Merge branch 'main' into zhongdaor/gpt-oss-frontend (zhongdaor-nv, Sep 18, 2025)
Cargo.toml (1 addition, 1 deletion)
@@ -81,4 +81,4 @@ opt-level = 3
[profile.release]
# These make the build much slower but shrink the binary, and could help performance
codegen-units = 1
lto = true
lto = true
lib/llm/src/discovery/watcher.rs (6 additions)
@@ -287,6 +287,12 @@ impl ModelWatcher {
let Some(mut card) = card else {
anyhow::bail!("Missing model deployment card");
};

// Ensure runtime_config is populated: prefer entry value if present
if let Some(rc) = model_entry.runtime_config.clone() {
card.runtime_config = rc;
}

// Download tokenizer.json etc to local disk
// This cache_dir is a tempfile::TempDir that will be deleted on drop. I _think_
// OpenAIPreprocessor::new loads the files, so we can delete them after this
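A self-contained sketch of the precedence rule this hunk introduces: a runtime config carried on the discovered model entry wins over whatever the deployment card already had. The stub types and the `"gpt_oss"` parser name below are assumptions for illustration, not the crate's real definitions; only the `runtime_config` / `reasoning_parser` field names are taken from the diff.

```rust
#[derive(Clone, Debug, Default, PartialEq)]
struct ModelRuntimeConfig {
    reasoning_parser: Option<String>, // field name taken from the diff
}

struct ModelEntry {
    runtime_config: Option<ModelRuntimeConfig>,
}

struct ModelDeploymentCard {
    runtime_config: ModelRuntimeConfig,
}

// Prefer the entry's config when present; otherwise keep the card's value.
fn apply_entry_config(entry: &ModelEntry, card: &mut ModelDeploymentCard) {
    if let Some(rc) = entry.runtime_config.clone() {
        card.runtime_config = rc;
    }
}

fn main() {
    let entry = ModelEntry {
        runtime_config: Some(ModelRuntimeConfig {
            reasoning_parser: Some("gpt_oss".to_string()), // hypothetical parser name
        }),
    };
    let mut card = ModelDeploymentCard::default_card();
    apply_entry_config(&entry, &mut card);
    assert_eq!(card.runtime_config.reasoning_parser.as_deref(), Some("gpt_oss"));
}

impl ModelDeploymentCard {
    fn default_card() -> Self {
        Self { runtime_config: ModelRuntimeConfig::default() }
    }
}
```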
lib/llm/src/preprocessor.rs (9 additions)
@@ -95,6 +95,8 @@ pub struct OpenAIPreprocessor {
formatter: Arc<dyn OAIPromptFormatter>,
tokenizer: Arc<dyn Tokenizer>,
model_info: Arc<dyn ModelInfo>,
/// Per-model runtime configuration propagated to the response generator (e.g., reasoning/tool parser selection)
runtime_config: crate::local_model::runtime_config::ModelRuntimeConfig,
}

impl OpenAIPreprocessor {
@@ -120,11 +122,15 @@ impl OpenAIPreprocessor {
};
let model_info = model_info.get_model_info()?;

// Initialize runtime config from the model deployment card
let runtime_config = mdc.runtime_config.clone();

Ok(Arc::new(Self {
formatter,
tokenizer,
model_info,
mdcsum,
runtime_config,
}))
}
/// Encode a string to its tokens
@@ -580,6 +586,9 @@ impl
let response_generator = request.response_generator(context.id().to_string());
let mut response_generator = Box::new(response_generator);

// set the runtime configuration
response_generator.set_runtime_config(self.runtime_config.clone());

// convert the chat completion request to a common completion request
let (common_request, annotations) = self.preprocess_request(&request)?;

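Taken together, the preprocessor changes thread the config along one path: deployment card, then `OpenAIPreprocessor.runtime_config`, then `set_runtime_config` on each response generator before any deltas stream out. A minimal runnable model of that ordering, with everything reduced to stubs (the real `response_generator` takes a context id and more):

```rust
#[derive(Clone, Default)]
struct ModelRuntimeConfig {
    reasoning_parser: Option<String>,
}

struct OpenAIPreprocessorStub {
    runtime_config: ModelRuntimeConfig, // cloned from the card in `new`
}

#[derive(Default)]
struct ResponseGeneratorStub {
    runtime_config: ModelRuntimeConfig,
}

impl ResponseGeneratorStub {
    fn set_runtime_config(&mut self, rc: ModelRuntimeConfig) {
        self.runtime_config = rc;
    }
}

impl OpenAIPreprocessorStub {
    // Mirrors the diff: configure the generator first, then preprocess the request.
    fn begin_request(&self) -> ResponseGeneratorStub {
        let mut generator = ResponseGeneratorStub::default();
        generator.set_runtime_config(self.runtime_config.clone());
        generator
    }
}

fn main() {
    let pre = OpenAIPreprocessorStub {
        runtime_config: ModelRuntimeConfig { reasoning_parser: Some("gpt_oss".into()) },
    };
    let generator = pre.begin_request();
    assert!(generator.runtime_config.reasoning_parser.is_some());
}
```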
lib/llm/src/protocols/openai/chat_completions/delta.rs (14 additions)
@@ -120,6 +120,20 @@ impl DeltaGenerator {
}
}

/// Update runtime configuration and reconfigure the reasoning parser accordingly.
pub fn set_runtime_config(&mut self, runtime_config: ModelRuntimeConfig) {
self.options.runtime_config = runtime_config.clone();
match self.options.runtime_config.reasoning_parser.as_deref() {
Some(name) => {
self.reasoning_parser =
Some(ReasoningParserType::get_reasoning_parser_from_name(name));
}
None => {
self.reasoning_parser = None;
}
}
}

/// Updates the prompt token usage count.
///
/// # Arguments
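The rule this method encodes: `Some(name)` swaps in the named reasoning parser, `None` clears any previously configured one. A runnable stub model of that behavior; the real `ReasoningParserType::get_reasoning_parser_from_name` returns a parser object, so a plain string stands in for it here, and the `"gpt_oss"` name is hypothetical.

```rust
#[derive(Clone, Default)]
struct ModelRuntimeConfig {
    reasoning_parser: Option<String>,
}

#[derive(Default)]
struct DeltaGeneratorStub {
    runtime_config: ModelRuntimeConfig,
    reasoning_parser: Option<String>, // stand-in for the parser object
}

impl DeltaGeneratorStub {
    fn set_runtime_config(&mut self, runtime_config: ModelRuntimeConfig) {
        self.runtime_config = runtime_config.clone();
        // Some(name) selects a parser by name; None clears it.
        self.reasoning_parser = runtime_config.reasoning_parser;
    }
}

fn main() {
    let mut generator = DeltaGeneratorStub::default();
    generator.set_runtime_config(ModelRuntimeConfig {
        reasoning_parser: Some("gpt_oss".into()),
    });
    assert_eq!(generator.reasoning_parser.as_deref(), Some("gpt_oss"));

    // A config without a parser name resets any earlier choice.
    generator.set_runtime_config(ModelRuntimeConfig::default());
    assert!(generator.reasoning_parser.is_none());
}
```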
lib/parsers/src/reasoning/gpt_oss_parser.rs (49 additions, 2 deletions)
@@ -173,9 +173,8 @@ impl ReasoningParser for GptOssReasoningParser {
}

if let Some(channel) = self.parser.current_channel() {
tracing::debug!("Current channel: {}", channel);
tracing::debug!("Current channel {}", channel);
if channel == "final" {
tracing::debug!("In final channel, processing normal text");
// If we're in the final channel, we should not parse reasoning
if let Some(current) = self.parser.last_content_delta().unwrap_or_default() {
tracing::debug!("Got normal text delta of {} chars", current.len());
@@ -186,6 +185,54 @@
}
tracing::debug!("No content delta in final channel");
ParserResult::default()
} else if channel == "commentary" {
// If we're in the commentary channel, return the raw token content and recover
// content that has already been consumed by the parser, so that the tool parser
// can process it properly
if let Ok(enc) = get_harmony_encoding() {
let raw_content = self.parser.current_content().unwrap_or_default();
let mut final_text = _text.to_string();

// Need to recover commentary content that has already been consumed by the parser
if raw_content.is_empty() {
let tokens = self.parser.tokens();

// Get the token id for "<|channel|>"
let start_token_id = enc
.tokenizer()
.encode_with_special_tokens("<|channel|>")
.last()
.copied();

// Find the last occurrence of the <|channel|> token (id 20005) in the tokens vector
let last_channel_token_idx = start_token_id
.and_then(|token_id| {
tokens.iter().rposition(|token| *token == token_id)
})
.unwrap_or(0);

// Then take the generated text from the last <|channel|> to the end of self.parser.tokens()
let end_token_idx = self.parser.tokens().len();
// Use harmony's decode_utf8 to translate the tokens back to text
let generated_text = enc
.tokenizer()
.decode_utf8(
&self.parser.tokens()[last_channel_token_idx..end_token_idx],
)
.unwrap();

final_text = generated_text;

}

ParserResult {
normal_text: final_text,
reasoning_text: String::new(),
}
} else {
tracing::warn!("Failed to get harmony encoding for raw token decoding");
ParserResult::default()
}
} else {
tracing::debug!("In reasoning channel: {}", channel);
if let Some(current) = self.parser.last_content_delta().unwrap_or_default() {
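The recovery logic above boils down to one operation: find the last occurrence of the `<|channel|>` special token in the accumulated stream and re-decode everything from that index, since the streaming parser has already consumed that text as structure. A self-contained sketch of the index arithmetic, using plain `u32` ids (20005 stands in for `<|channel|>`, per the comment in the diff; the other ids are arbitrary):

```rust
/// Return the token suffix starting at the last occurrence of `marker`
/// (inclusive), or the whole slice if the marker never appeared.
fn suffix_from_last_marker(tokens: &[u32], marker: u32) -> &[u32] {
    let start = tokens.iter().rposition(|&t| t == marker).unwrap_or(0);
    &tokens[start..]
}

fn main() {
    let tokens = [101, 20005, 7, 8, 20005, 42, 43];
    let suffix = suffix_from_last_marker(&tokens, 20005);
    assert_eq!(suffix, &[20005, 42, 43][..]);
    // In the parser, this suffix is what decode_utf8 turns back into text.
}
```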
lib/parsers/src/tool_calling/harmony/harmony_parser.rs (106 additions, 2 deletions)
@@ -3,9 +3,10 @@

use super::config::JsonParserConfig;
use super::response::{CalledFunction, ToolCallResponse, ToolCallType};
use openai_harmony::StreamableParser;
use openai_harmony::chat::{Content::Text, Role};
use openai_harmony::{HarmonyEncoding, HarmonyEncodingName, load_harmony_encoding};
use openai_harmony::{
HarmonyEncoding, HarmonyEncodingName, StreamableParser, load_harmony_encoding,
};
use serde_json::Value;
use std::sync::OnceLock;

@@ -154,6 +155,109 @@ pub fn parse_tool_calls_harmony(
Ok((res, Some(normal_text.to_string())))
}

/// Parse tool calls from a complete Harmony Format text chunk using direct token parsing.
///
/// This function is optimized for parsing complete text chunks where the entire content
/// is available at once. It uses `parse_messages_from_completion_tokens` to directly
/// parse all tokens into Harmony Format messages, then extracts tool calls from messages
/// with the "commentary" channel and "functions.*" recipients.
///
/// Unlike `parse_tool_calls_harmony`, this function doesn't perform start token detection
/// or token-by-token streaming, making it more efficient for complete chunks.
///
/// # Arguments
/// * `text` - The complete Harmony Format text to parse
/// * `config` - Parser configuration (currently unused but kept for API consistency)
///
/// # Returns
/// * `Ok((tool_calls, normal_text))` - Tuple containing extracted tool calls and any normal text
/// * `Err(e)` - If parsing fails due to encoding or tokenization errors
///
/// # Example input
/// `<|channel|>commentary to=functions.get_current_weather <|constrain|>json<|message|>{"location":"San Francisco"}`
pub fn parse_tool_calls_harmony_complete(
text: &str,
config: &JsonParserConfig,
) -> anyhow::Result<(Vec<ToolCallResponse>, Option<String>)> {
let _ = config;
let enc = match get_harmony_encoding().as_ref() {
Ok(e) => e,
Err(e) => {
tracing::debug!("Failed to load harmony encoding: {e}. Tool calls will not be parsed.");
return Ok((vec![], Some(text.to_string())));
}
};

// Encode the text into tokens using harmony encoding
let tokens: Vec<u32> = enc.tokenizer().encode_with_special_tokens(text);
let messages = match enc.parse_messages_from_completion_tokens(tokens, Some(Role::Assistant)) {
Ok(messages) => messages,
Err(e) => {
tracing::debug!(
"Failed to parse messages from completion tokens: {e}. Tool calls will not be parsed."
);
return Ok((vec![], Some(text.to_string())));
}
};

let mut normal_text = String::new();

let mut res = Vec::with_capacity(messages.len());
let mut call_idx = 0usize; // Index of the tool call

for message in messages.iter() {
if message.author.role == Role::Assistant
&& message.channel.as_deref() == Some("commentary")
&& message
.recipient
.as_deref()
.unwrap_or_default()
.starts_with("functions.")
{
let Some(fname) = message
.recipient
.as_ref()
.and_then(|r| r.split('.').nth(1))
.filter(|s| !s.is_empty())
.map(|s| s.to_string())
else {
continue;
};

let args = match message.content.first() {
Some(Text(text)) => match serde_json::from_str::<Value>(text.text.trim()) {
Ok(value) => value,
Err(_) => {
Value::Null // Set args to null if it's not valid JSON
}
},
_ => {
Value::Null // Set args to null if it's not a text content
}
};
// Add tool call to result if args is valid JSON
if !args.is_null() {
call_idx += 1;
res.push(ToolCallResponse {
id: format!("call-{}", call_idx),
tp: ToolCallType::Function,
function: CalledFunction {
name: fname.to_string(),
// Safety: serializing a serde_json::Value that was parsed from JSON cannot fail
arguments: serde_json::to_string(&args).unwrap(),
},
});
}
}
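// Analysis-channel text is surfaced as the normal (non-tool-call) text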
if message.author.role == Role::Assistant && message.channel.as_deref() == Some("analysis")
{
// Use .first() so an empty content vector cannot panic
if let Some(Text(t)) = message.content.first() {
normal_text.push_str(&t.text);
}
}
}
Ok((res, Some(normal_text.to_string())))
}

pub fn detect_tool_call_start_harmony(chunk: &str, config: &JsonParserConfig) -> bool {
let trimmed = chunk.trim();
if trimmed.is_empty() {
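A hedged usage sketch of the new entry point, reusing the example line from the doc comment. The crate path in the `use` line is assumed (the re-exports themselves appear in the `mod.rs` diffs below), `JsonParserConfig::default()` assumes a `Default` impl, and the output depends on the harmony encoding being loadable at runtime, so the expected result is shown as a comment rather than asserted.

```rust
// Crate name is assumed for illustration; the re-exported items match the diff.
use parsers::tool_calling::{JsonParserConfig, parse_tool_calls_harmony_complete};

fn main() -> anyhow::Result<()> {
    let text = r#"<|channel|>commentary to=functions.get_current_weather <|constrain|>json<|message|>{"location":"San Francisco"}"#;
    let config = JsonParserConfig::default(); // assumed Default impl
    let (calls, normal_text) = parse_tool_calls_harmony_complete(text, &config)?;
    for call in &calls {
        // Expected: get_current_weather -> {"location":"San Francisco"}
        println!("{} -> {}", call.function.name, call.function.arguments);
    }
    println!("normal text: {:?}", normal_text); // analysis-channel text, if any
    Ok(())
}
```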
lib/parsers/src/tool_calling/harmony/mod.rs (3 additions, 1 deletion)
@@ -4,4 +4,6 @@
pub mod harmony_parser;

pub use super::{config, response};
pub use harmony_parser::{detect_tool_call_start_harmony, parse_tool_calls_harmony};
pub use harmony_parser::{
detect_tool_call_start_harmony, parse_tool_calls_harmony, parse_tool_calls_harmony_complete,
};
lib/parsers/src/tool_calling/mod.rs (1 addition, 1 deletion)
@@ -11,7 +11,7 @@ pub mod tools;

// Re-export main types and functions for convenience
pub use config::{JsonParserConfig, ToolCallConfig, ToolCallParserType};
pub use harmony::parse_tool_calls_harmony;
pub use harmony::{parse_tool_calls_harmony, parse_tool_calls_harmony_complete};
pub use json::try_tool_call_parse_json;
pub use parsers::{detect_and_parse_tool_call, try_tool_call_parse};
pub use pythonic::try_tool_call_parse_pythonic;
lib/parsers/src/tool_calling/parsers.rs (3 additions, 2 deletions)
@@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0

use super::config::{ToolCallConfig, ToolCallParserType};
use super::harmony::{detect_tool_call_start_harmony, parse_tool_calls_harmony};
use super::harmony::{detect_tool_call_start_harmony, parse_tool_calls_harmony_complete};
use super::json::{detect_tool_call_start_json, try_tool_call_parse_json};
use super::pythonic::{detect_tool_call_start_pythonic, try_tool_call_parse_pythonic};
use super::response::ToolCallResponse;
@@ -43,7 +43,8 @@ pub fn try_tool_call_parse(
Ok((results, normal_content))
}
ToolCallParserType::Harmony => {
let (results, normal_content) = parse_tool_calls_harmony(message, &config.json)?;
let (results, normal_content) =
parse_tool_calls_harmony_complete(message, &config.json)?;
Ok((results, normal_content))
}
ToolCallParserType::Pythonic => {