From 879b1d5af7a7827498afea50d398d845f8a8e21c Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Sun, 16 Nov 2025 01:48:14 +0000 Subject: [PATCH 1/7] feat: support multimodal models with non-standard Jinja2 tags Implement placeholder replacement for non-standard Jinja2 tags (e.g., {% generation %}) that minijinja doesn't recognize but vLLM's custom extensions require. Rust frontend replaces tags with __JINJA_BLOCK___ for validation, Python backend restores them before vLLM processing. Also fix model card parsing for multimodal models: make num_hidden_layers optional and handle eos_token_id as single value or array. Signed-off-by: Keiven Chang --- Cargo.lock | 222 +----------------- .../vllm/multimodal_utils/chat_processor.py | 12 + lib/llm/src/model_card.rs | 26 +- .../prompt/template/formatters.rs | 73 +++++- 4 files changed, 107 insertions(+), 226 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 32fd26794f2..86c26e48f46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -238,18 +238,6 @@ dependencies = [ "pin-project-lite", ] -[[package]] -name = "async-channel" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "924ed96dd52d1b75e9c1a3e6275715fd320f5f9439fb5a4a11fa51f4221158d2" -dependencies = [ - "concurrent-queue", - "event-listener-strategy", - "futures-core", - "pin-project-lite", -] - [[package]] name = "async-nats" version = "0.40.0" @@ -2089,16 +2077,6 @@ dependencies = [ "syn 2.0.110", ] -[[package]] -name = "dlpark" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc178fc3bf4ce54c26ccffcf271ff574954ac4b940f15121be3d69f277194537" -dependencies = [ - "half 2.7.1", - "pyo3", -] - [[package]] name = "dlv-list" version = "0.5.2" @@ -4072,15 +4050,6 @@ dependencies = [ "web-time", ] -[[package]] -name = "indoc" -version = "2.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" -dependencies = [ - "rustversion", -] - [[package]] name = "inlinable_string" version = "0.1.15" @@ -4170,15 +4139,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "inventory" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc61209c082fbeb19919bee74b176221b27223e27b65d781eb91af24eb1fb46e" -dependencies = [ - "rustversion", -] - [[package]] name = "iovec" version = "0.1.4" @@ -4601,40 +4561,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "kvbm-py3" -version = "0.7.0" -dependencies = [ - "anyhow", - "async-stream", - "async-trait", - "cudarc 0.16.6", - "derive-getters", - "dlpark", - "dynamo-llm", - "dynamo-runtime", - "either", - "futures", - "local-ip-address", - "once_cell", - "prometheus", - "pyo3", - "pyo3-async-runtimes", - "pythonize", - "rand 0.9.2", - "rstest 0.25.0", - "serde", - "serde_json", - "socket2 0.6.1", - "thiserror 2.0.17", - "tokio", - "tokio-stream", - "tokio-util", - "tracing", - "tracing-subscriber", - "uuid 1.18.1", -] - [[package]] name = "lalrpop-util" version = "0.20.2" @@ -5004,15 +4930,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "memoffset" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] - [[package]] name = "metal" version = "0.27.0" @@ -5581,7 +5498,7 @@ dependencies = [ "bitflags 1.3.2", "cfg-if 1.0.4", "libc", - "memoffset 0.7.1", + "memoffset", "pin-utils", ] @@ -6751,107 +6668,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "pyo3" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" -dependencies = [ - "cfg-if 1.0.4", - "indoc", - "libc", - "memoffset 0.9.1", - "once_cell", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-async-runtimes" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "977dc837525cfd22919ba6a831413854beb7c99a256c03bf8624ad707e45810e" -dependencies = [ - "async-channel", - "clap 4.5.51", - "futures", - "inventory", - "once_cell", - "pin-project-lite", - "pyo3", - "pyo3-async-runtimes-macros", - "tokio", -] - -[[package]] -name = "pyo3-async-runtimes-macros" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2df2884957d2476731f987673befac5d521dff10abb0a7cbe12015bc7702fe9" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.110", -] - -[[package]] -name = "pyo3-build-config" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" -dependencies = [ - "once_cell", - "target-lexicon", -] - -[[package]] -name = "pyo3-ffi" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn 2.0.110", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" -dependencies = [ - "heck 0.5.0", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn 2.0.110", -] - -[[package]] -name = "pythonize" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91a6ee7a084f913f98d70cdc3ebec07e852b735ae3059a1500db2661265da9ff" -dependencies = [ - "pyo3", - "serde", -] - [[package]] name = "qoi" version = "0.4.1" @@ -7442,18 +7258,6 @@ dependencies = [ "rustc_version", ] -[[package]] -name = "rstest" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fc39292f8613e913f7df8fa892b8944ceb47c247b78e1b1ae2f09e019be789d" -dependencies = [ - "futures-timer", - "futures-util", - "rstest_macros 0.25.0", - "rustc_version", -] - [[package]] name = "rstest_macros" version = "0.18.2" @@ -7489,24 +7293,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "rstest_macros" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f168d99749d307be9de54d23fd226628d99768225ef08f6ffb52e0182a27746" -dependencies = [ - "cfg-if 1.0.4", - "glob", - "proc-macro-crate", - "proc-macro2", - "quote", - "regex", - "relative-path", - "rustc_version", - "syn 2.0.110", - "unicode-ident", -] - [[package]] name = "rstest_reuse" version = "0.7.0" @@ -9979,12 +9765,6 @@ dependencies = [ "rand 0.8.5", ] -[[package]] -name = "unindent" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" - [[package]] name = "unsafe-libyaml" version = "0.2.11" diff --git a/components/src/dynamo/vllm/multimodal_utils/chat_processor.py b/components/src/dynamo/vllm/multimodal_utils/chat_processor.py index fe8d95dc81d..5ef4305aea4 100644 --- a/components/src/dynamo/vllm/multimodal_utils/chat_processor.py +++ b/components/src/dynamo/vllm/multimodal_utils/chat_processor.py @@ -146,6 +146,18 @@ async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResu else: chat_template = request.chat_template or self.tokenizer.chat_template + # Restore non-standard Jinja2 tags that were replaced with placeholders by Rust frontend + # for minijinja validation. Backend frameworks (like vLLM) may have custom extensions for these. + if chat_template: + import re + + # Restore all __JINJA_BLOCK___ placeholders back to {% tag %} + chat_template = re.sub( + r"__JINJA_BLOCK_([A-Z_]+)__", + lambda m: "{% " + m.group(1).lower() + " %}", + chat_template, + ) + ( conversation, request_prompts, diff --git a/lib/llm/src/model_card.rs b/lib/llm/src/model_card.rs index 0674e1c0a16..3d6250ad420 100644 --- a/lib/llm/src/model_card.rs +++ b/lib/llm/src/model_card.rs @@ -621,7 +621,8 @@ struct HFTextConfig { max_position_embeddings: Option, /// number of layers in the model - num_hidden_layers: usize, + /// Optional because some multimodal models (e.g., LLaVA) don't include this in text_config + num_hidden_layers: Option, /// number of attention heads in the model num_attention_heads: Option, @@ -701,11 +702,32 @@ impl HFConfig { }) .or_else(|| { // Maybe it's in generation_config.json - crate::file_json_field(&gencfg_path, "eos_token_id") + crate::file_json_field::(&gencfg_path, "eos_token_id") .inspect_err( |err| tracing::warn!(%err, "Missing eos_token_id in generation_config.json"), ) .ok() + .and_then(|v| { + if v.is_number() { + v.as_number() + .and_then(|n| n.as_u64()) + .map(|n| vec![n as TokenIdType]) + } else if v.is_array() { + let arr = v.as_array().unwrap(); + Some( + arr.iter() + .filter_map(|inner_v| { + inner_v + .as_number() + .and_then(|n| n.as_u64()) + .map(|n| n as TokenIdType) + }) + .collect(), + ) + } else { + None + } + }) }) .ok_or_else(|| { anyhow::anyhow!( diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs index 82a63a6bf09..5382562fa3c 100644 --- a/lib/llm/src/preprocessor/prompt/template/formatters.rs +++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs @@ -9,6 +9,67 @@ use either::Either; use minijinja::{Environment, Value}; use tracing; +/// Replace non-standard Jinja2 block tags with placeholders +/// +/// minijinja doesn't expose its tag list publicly - they're hardcoded in a private match statement +/// in the parser. This list is derived from minijinja v2.12.0's parser.rs implementation. +/// See: https://github.com/mitsuhiko/minijinja/blob/main/minijinja/src/compiler/parser.rs#L542 +fn replace_non_standard_blocks(template: &str) -> String { + use regex::Regex; + + // Standard Jinja2/minijinja tags (cannot be queried from minijinja API) + let standard_keywords = [ + "for", + "endfor", + "if", + "elif", + "else", + "endif", + "block", + "endblock", + "extends", + "include", + "import", + "from", + "macro", + "endmacro", + "call", + "endcall", + "set", + "endset", + "with", + "endwith", + "filter", + "endfilter", + "autoescape", + "endautoescape", + "raw", + "endraw", + "do", + ]; + + let re = Regex::new(r"\{%\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*%\}").unwrap(); + let mut result = template.to_string(); + let mut replacements = Vec::new(); + + for cap in re.captures_iter(template) { + let full_match = cap.get(0).unwrap().as_str(); + let tag_name = cap.get(1).unwrap().as_str(); + + if !standard_keywords.contains(&tag_name) { + // Non-standard tag (e.g., vLLM's {% generation %}) - replace with placeholder + let placeholder = format!("__JINJA_BLOCK_{}", tag_name.to_uppercase()); + replacements.push((full_match.to_string(), placeholder)); + } + } + + for (original, placeholder) in replacements { + result = result.replace(&original, &placeholder); + } + + result +} + impl JinjaEnvironment { fn env(self) -> Environment<'static> { self.env @@ -64,8 +125,12 @@ impl HfTokenizerConfigJsonFormatter { ); supports_add_generation_prompt = Some(true); } - env.add_template_owned("default", x.to_string())?; - env.add_template_owned("tool_use", x.to_string())?; + // Replace non-standard Jinja2 block tags with placeholders for minijinja validation + // Standard Jinja2/minijinja blocks: for, if, block, macro, call, filter, set, with, autoescape, trans + // Any other {% tag %} blocks are likely backend-specific extensions (like vLLM's {% generation %}) + let template_for_validation = replace_non_standard_blocks(x); + env.add_template_owned("default", template_for_validation.clone())?; + env.add_template_owned("tool_use", template_for_validation)?; } Either::Right(map) => { for t in map { @@ -87,7 +152,9 @@ impl HfTokenizerConfigJsonFormatter { } else { supports_add_generation_prompt = Some(false); } - env.add_template_owned(k.to_string(), v.to_string())?; + // Replace non-standard Jinja2 block tags with placeholders for minijinja validation + let template_for_validation = replace_non_standard_blocks(v); + env.add_template_owned(k.to_string(), template_for_validation)?; } } if env.templates().count() == 0 { From d5eaec9c00087f9512dca46a040ebc582bc7c5a5 Mon Sep 17 00:00:00 2001 From: Krishnan Prashanth Date: Mon, 17 Nov 2025 04:31:50 -0800 Subject: [PATCH 2/7] Handle chat templates that expect message[content] be a string Signed-off-by: Krishnan Prashanth --- lib/llm/src/preprocessor/prompt/template.rs | 1 + .../prompt/template/formatters.rs | 47 ++++++ .../src/preprocessor/prompt/template/oai.rs | 148 ++++++++++++++---- 3 files changed, 168 insertions(+), 28 deletions(-) diff --git a/lib/llm/src/preprocessor/prompt/template.rs b/lib/llm/src/preprocessor/prompt/template.rs index 87748046ae0..8ea6f714942 100644 --- a/lib/llm/src/preprocessor/prompt/template.rs +++ b/lib/llm/src/preprocessor/prompt/template.rs @@ -106,6 +106,7 @@ struct HfTokenizerConfigJsonFormatter { config: ChatTemplate, mixins: Arc, supports_add_generation_prompt: bool, + requires_content_arrays: bool, } // /// OpenAI Standard Prompt Formatter diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs index 5382562fa3c..edb93eb8963 100644 --- a/lib/llm/src/preprocessor/prompt/template/formatters.rs +++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs @@ -70,6 +70,44 @@ fn replace_non_standard_blocks(template: &str) -> String { result } +/// Detects whether a chat template requires message content as arrays (multimodal) +/// or accepts simple strings (standard text-only templates). +/// +/// This function test-renders the template with both formats: +/// - Array format: `[{"type": "text", "text": "X"}]` +/// - String format: `"X"` +/// +/// If the array format works but string format doesn't produce output, +/// the template requires arrays (e.g., llava, Qwen-VL multimodal templates). +fn detect_content_array_usage(env: &Environment) -> bool { + use minijinja::context; + use serde_json::json; + + // Test with array format + let test_array = context! { + messages => json!([{"role": "user", "content": [{"type": "text", "text": "X"}]}]), + add_generation_prompt => false, + }; + + // Test with string format + let test_string = context! { + messages => json!([{"role": "user", "content": "X"}]), + add_generation_prompt => false, + }; + + let out_array = env + .get_template("default") + .and_then(|t| t.render(&test_array)) + .unwrap_or_default(); + let out_string = env + .get_template("default") + .and_then(|t| t.render(&test_string)) + .unwrap_or_default(); + + // If array works but string doesn't, template requires arrays + out_array.contains("X") && !out_string.contains("X") +} + impl JinjaEnvironment { fn env(self) -> Environment<'static> { self.env @@ -165,11 +203,20 @@ impl HfTokenizerConfigJsonFormatter { } } + // Detect at model load time whether this template requires content arrays + let requires_content_arrays = detect_content_array_usage(&env); + + tracing::info!( + "Template analysis: requires_content_arrays = {}", + requires_content_arrays + ); + Ok(HfTokenizerConfigJsonFormatter { env, config, mixins: Arc::new(mixins), supports_add_generation_prompt: supports_add_generation_prompt.unwrap_or(false), + requires_content_arrays, }) } } diff --git a/lib/llm/src/preprocessor/prompt/template/oai.rs b/lib/llm/src/preprocessor/prompt/template/oai.rs index 9d02e3a6980..4de38a020e5 100644 --- a/lib/llm/src/preprocessor/prompt/template/oai.rs +++ b/lib/llm/src/preprocessor/prompt/template/oai.rs @@ -73,10 +73,11 @@ fn may_be_fix_tool_schema(tools: serde_json::Value) -> Option { Some(Value::from_serialize(&updated_tools)) } -fn may_be_fix_msg_content(messages: serde_json::Value) -> Value { - // If messages[content] is provided as a list containing ONLY text parts, - // concatenate them into a string to match chat template expectations. - // Mixed content types are left for chat templates to handle. +fn may_be_fix_msg_content(messages: serde_json::Value, preserve_arrays: bool) -> Value { + // Bidirectional normalization for message content format: + // - If preserve_arrays=true (multimodal templates): Convert strings → arrays + // - If preserve_arrays=false (standard templates): Flatten text-only arrays → strings + // - Mixed content types are always preserved as-is let Some(arr) = messages.as_array() else { return Value::from_serialize(&messages); @@ -86,7 +87,20 @@ fn may_be_fix_msg_content(messages: serde_json::Value) -> Value { .iter() .map(|msg| { match msg.get("content") { - Some(serde_json::Value::Array(content_array)) => { + // Case 1: String → Array (for multimodal templates) + Some(serde_json::Value::String(text)) if preserve_arrays => { + let mut modified_msg = msg.clone(); + if let Some(msg_object) = modified_msg.as_object_mut() { + let content_array = serde_json::json!([{ + "type": "text", + "text": text + }]); + msg_object.insert("content".to_string(), content_array); + } + modified_msg + } + // Case 2: Array → String (for standard templates) + Some(serde_json::Value::Array(content_array)) if !preserve_arrays => { let is_text_only_array = !content_array.is_empty() && content_array.iter().all(|part| { part.get("type") @@ -159,19 +173,7 @@ impl OAIChatLikeRequest for NvCreateChatCompletionRequest { fn messages(&self) -> Value { let messages_json = serde_json::to_value(&self.inner.messages).unwrap(); - - let needs_fixing = if let Some(arr) = messages_json.as_array() { - arr.iter() - .any(|msg| msg.get("content").and_then(|c| c.as_array()).is_some()) - } else { - false - }; - - if needs_fixing { - may_be_fix_msg_content(messages_json) - } else { - Value::from_serialize(&messages_json) - } + Value::from_serialize(&messages_json) } fn tools(&self) -> Option { @@ -301,6 +303,15 @@ impl OAIPromptFormatter for HfTokenizerConfigJsonFormatter { let messages_canonical = req.messages(); let mut messages_for_template: serde_json::Value = serde_json::to_value(&messages_canonical).unwrap(); + + // Apply bidirectional content normalization based on template requirements + let preserve_arrays = self.requires_content_arrays; + messages_for_template = serde_json::to_value(may_be_fix_msg_content( + messages_for_template, + preserve_arrays, + )) + .unwrap(); + normalize_tool_arguments_in_messages(&mut messages_for_template); let ctx = context! { @@ -457,7 +468,10 @@ mod tests { }"#; let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); - let messages = serde_json::to_value(request.messages()).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + + // Test array → string normalization (preserve_arrays=false for standard templates) + let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap(); // Verify: text-only array is concatenated into a single string assert_eq!( @@ -500,7 +514,10 @@ mod tests { }"#; let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); - let messages = serde_json::to_value(request.messages()).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + + // Test array → string normalization (preserve_arrays=false for standard templates) + let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap(); // Verify: System message with string content remains unchanged assert_eq!( @@ -541,7 +558,10 @@ mod tests { }"#; let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); - let messages = serde_json::to_value(request.messages()).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + + // Empty arrays should be preserved regardless of preserve_arrays setting + let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap(); // Verify: Empty arrays are preserved as-is assert!(messages[0]["content"].is_array()); @@ -562,7 +582,10 @@ mod tests { }"#; let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); - let messages = serde_json::to_value(request.messages()).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + + // Test with preserve_arrays=false (standard templates) + let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap(); // Verify: String content is not modified assert_eq!( @@ -589,7 +612,10 @@ mod tests { }"#; let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); - let messages = serde_json::to_value(request.messages()).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + + // Mixed content should be preserved regardless of preserve_arrays setting + let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap(); // Verify: Mixed content types are preserved as array for template handling assert!(messages[0]["content"].is_array()); @@ -617,7 +643,10 @@ mod tests { }"#; let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); - let messages = serde_json::to_value(request.messages()).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + + // Non-text arrays should be preserved regardless of preserve_arrays setting + let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap(); // Verify: Non-text content arrays are preserved for template handling assert!(messages[0]["content"].is_array()); @@ -713,7 +742,8 @@ NORMAL MODE }"#; let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); - let messages = serde_json::to_value(request.messages()).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap(); // Mixed types should preserve array structure assert!(messages[0]["content"].is_array()); @@ -735,7 +765,8 @@ NORMAL MODE }"#; let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); - let messages = serde_json::to_value(request.messages()).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap(); // Unknown types mixed with text should preserve array assert!(messages[0]["content"].is_array()); @@ -873,11 +904,15 @@ NORMAL MODE }"#; let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); - let mut messages = serde_json::to_value(request.messages()).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + + // Apply content normalization with preserve_arrays=false (standard templates) + let mut messages = + serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap(); normalize_tool_arguments_in_messages(&mut messages); - // Multimodal content preserved as array + // Multimodal content preserved as array (mixed types not flattened) assert!(messages[0]["content"].is_array()); assert_eq!(messages[0]["content"].as_array().unwrap().len(), 3); @@ -889,6 +924,63 @@ NORMAL MODE ); } + /// Tests string → array normalization for multimodal templates + #[test] + fn test_may_be_fix_msg_content_string_to_array() { + let json_str = r#"{ + "model": "gpt-4o", + "messages": [ + { + "role": "user", + "content": "Hello, how are you?" + } + ] + }"#; + + let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + + // Test with preserve_arrays=true (multimodal templates) + let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, true)).unwrap(); + + // Verify: String is converted to array format + assert!(messages[0]["content"].is_array()); + let content_array = messages[0]["content"].as_array().unwrap(); + assert_eq!(content_array.len(), 1); + assert_eq!(content_array[0]["type"], "text"); + assert_eq!(content_array[0]["text"], "Hello, how are you?"); + } + + /// Tests that arrays are preserved when preserve_arrays=true + #[test] + fn test_may_be_fix_msg_content_array_preserved_with_multimodal() { + let json_str = r#"{ + "model": "gpt-4o", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "part 1"}, + {"type": "text", "text": "part 2"} + ] + } + ] + }"#; + + let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap(); + let messages_raw = serde_json::to_value(request.messages()).unwrap(); + + // Test with preserve_arrays=true (multimodal templates) + let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, true)).unwrap(); + + // Verify: Array is preserved as-is + assert!(messages[0]["content"].is_array()); + let content_array = messages[0]["content"].as_array().unwrap(); + assert_eq!(content_array.len(), 2); + assert_eq!(content_array[0]["text"], "part 1"); + assert_eq!(content_array[1]["text"], "part 2"); + } + fn user() -> Msg { Msg::User(Default::default()) } From 7e662ce7399f66f2d2deec1075ccd4257352a8c9 Mon Sep 17 00:00:00 2001 From: Krishnan Prashanth Date: Mon, 17 Nov 2025 04:59:12 -0800 Subject: [PATCH 3/7] Cleaning up comments + variable names + structure Signed-off-by: Krishnan Prashanth --- .../prompt/template/formatters.rs | 25 +++++++++---------- .../src/preprocessor/prompt/template/oai.rs | 4 +-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs index edb93eb8963..03298ec9f3b 100644 --- a/lib/llm/src/preprocessor/prompt/template/formatters.rs +++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs @@ -6,8 +6,9 @@ use std::sync::Arc; use super::tokcfg::{ChatTemplate, raise_exception, strftime_now, tojson}; use super::{ContextMixins, HfTokenizerConfigJsonFormatter, JinjaEnvironment}; use either::Either; -use minijinja::{Environment, Value}; +use minijinja::{Environment, Value, context}; use tracing; +use serde_json::json; /// Replace non-standard Jinja2 block tags with placeholders /// @@ -74,38 +75,36 @@ fn replace_non_standard_blocks(template: &str) -> String { /// or accepts simple strings (standard text-only templates). /// /// This function test-renders the template with both formats: -/// - Array format: `[{"type": "text", "text": "X"}]` -/// - String format: `"X"` +/// - Array format: `[{"type": "text", "text": "template_test"}]` +/// - String format: `"template_test"` /// /// If the array format works but string format doesn't produce output, -/// the template requires arrays (e.g., llava, Qwen-VL multimodal templates). +/// the template requires arrays (multimodal templates). fn detect_content_array_usage(env: &Environment) -> bool { - use minijinja::context; - use serde_json::json; // Test with array format - let test_array = context! { - messages => json!([{"role": "user", "content": [{"type": "text", "text": "X"}]}]), + let array_msg = context! { + messages => json!([{"role": "user", "content": [{"type": "text", "text": "template_test"}]}]), add_generation_prompt => false, }; // Test with string format - let test_string = context! { - messages => json!([{"role": "user", "content": "X"}]), + let string_msg = context! { + messages => json!([{"role": "user", "content": "template_test"}]), add_generation_prompt => false, }; let out_array = env .get_template("default") - .and_then(|t| t.render(&test_array)) + .and_then(|t| t.render(&array_msg)) .unwrap_or_default(); let out_string = env .get_template("default") - .and_then(|t| t.render(&test_string)) + .and_then(|t| t.render(&string_msg)) .unwrap_or_default(); // If array works but string doesn't, template requires arrays - out_array.contains("X") && !out_string.contains("X") + out_array.contains("template_test") && !out_string.contains("template_test") } impl JinjaEnvironment { diff --git a/lib/llm/src/preprocessor/prompt/template/oai.rs b/lib/llm/src/preprocessor/prompt/template/oai.rs index 4de38a020e5..d041ebf5ff3 100644 --- a/lib/llm/src/preprocessor/prompt/template/oai.rs +++ b/lib/llm/src/preprocessor/prompt/template/oai.rs @@ -87,7 +87,7 @@ fn may_be_fix_msg_content(messages: serde_json::Value, preserve_arrays: bool) -> .iter() .map(|msg| { match msg.get("content") { - // Case 1: String → Array (for multimodal templates) + // Case 1: String to Array (for multimodal templates) Some(serde_json::Value::String(text)) if preserve_arrays => { let mut modified_msg = msg.clone(); if let Some(msg_object) = modified_msg.as_object_mut() { @@ -99,7 +99,7 @@ fn may_be_fix_msg_content(messages: serde_json::Value, preserve_arrays: bool) -> } modified_msg } - // Case 2: Array → String (for standard templates) + // Case 2: Array to String (for standard templates) Some(serde_json::Value::Array(content_array)) if !preserve_arrays => { let is_text_only_array = !content_array.is_empty() && content_array.iter().all(|part| { From fdfc7dac33d0e0ec102ae78dc3c2613461bf8796 Mon Sep 17 00:00:00 2001 From: Krishnan Prashanth Date: Mon, 17 Nov 2025 06:14:29 -0800 Subject: [PATCH 4/7] fix: cargo_fmt fix Signed-off-by: Krishnan Prashanth --- lib/llm/src/preprocessor/prompt/template/formatters.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs index 03298ec9f3b..225a341b5ad 100644 --- a/lib/llm/src/preprocessor/prompt/template/formatters.rs +++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs @@ -7,8 +7,8 @@ use super::tokcfg::{ChatTemplate, raise_exception, strftime_now, tojson}; use super::{ContextMixins, HfTokenizerConfigJsonFormatter, JinjaEnvironment}; use either::Either; use minijinja::{Environment, Value, context}; -use tracing; use serde_json::json; +use tracing; /// Replace non-standard Jinja2 block tags with placeholders /// @@ -81,7 +81,6 @@ fn replace_non_standard_blocks(template: &str) -> String { /// If the array format works but string format doesn't produce output, /// the template requires arrays (multimodal templates). fn detect_content_array_usage(env: &Environment) -> bool { - // Test with array format let array_msg = context! { messages => json!([{"role": "user", "content": [{"type": "text", "text": "template_test"}]}]), From ef0165f2b6bdba6cb6b390d02b79d1c41655f1d8 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Mon, 17 Nov 2025 23:38:11 +0000 Subject: [PATCH 5/7] fix: Handle two non Jinja2 tags in chat templates - Add remove_known_non_jinja2_tags to strip {% generation %} tags before minijinja validation - Fixes LLaVA and other models using vLLM-specific template extensions - Make num_hidden_layers optional in model card parsing for multimodal compatibility - Handle eos_token_id as single value or array in generation_config.json Signed-off-by: Keiven Chang --- lib/llm/src/model_card.rs | 26 +++++++++- .../prompt/template/formatters.rs | 52 +++++++++++++++++-- 2 files changed, 73 insertions(+), 5 deletions(-) diff --git a/lib/llm/src/model_card.rs b/lib/llm/src/model_card.rs index 0674e1c0a16..3d6250ad420 100644 --- a/lib/llm/src/model_card.rs +++ b/lib/llm/src/model_card.rs @@ -621,7 +621,8 @@ struct HFTextConfig { max_position_embeddings: Option, /// number of layers in the model - num_hidden_layers: usize, + /// Optional because some multimodal models (e.g., LLaVA) don't include this in text_config + num_hidden_layers: Option, /// number of attention heads in the model num_attention_heads: Option, @@ -701,11 +702,32 @@ impl HFConfig { }) .or_else(|| { // Maybe it's in generation_config.json - crate::file_json_field(&gencfg_path, "eos_token_id") + crate::file_json_field::(&gencfg_path, "eos_token_id") .inspect_err( |err| tracing::warn!(%err, "Missing eos_token_id in generation_config.json"), ) .ok() + .and_then(|v| { + if v.is_number() { + v.as_number() + .and_then(|n| n.as_u64()) + .map(|n| vec![n as TokenIdType]) + } else if v.is_array() { + let arr = v.as_array().unwrap(); + Some( + arr.iter() + .filter_map(|inner_v| { + inner_v + .as_number() + .and_then(|n| n.as_u64()) + .map(|n| n as TokenIdType) + }) + .collect(), + ) + } else { + None + } + }) }) .ok_or_else(|| { anyhow::anyhow!( diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs index 82a63a6bf09..0289300ef69 100644 --- a/lib/llm/src/preprocessor/prompt/template/formatters.rs +++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs @@ -9,6 +9,21 @@ use either::Either; use minijinja::{Environment, Value}; use tracing; +/// Remove known non-standard Jinja2 tags from chat templates +/// +/// Some models use custom Jinja2 extensions that minijinja doesn't recognize. These tags +/// are typically metadata markers that don't affect the rendered output. For example: +/// - {% generation %} / {% endgeneration %}: Used by vLLM's AssistantTracker to mark +/// assistant-generated content. The tags themselves don't produce output. +/// +/// By removing these tags before validation, we allow templates with backend-specific +/// extensions to work with minijinja while maintaining correct output semantics. +fn remove_known_non_jinja2_tags(template: &str) -> String { + template + .replace("{% generation %}", "") + .replace("{% endgeneration %}", "") +} + impl JinjaEnvironment { fn env(self) -> Environment<'static> { self.env @@ -64,8 +79,10 @@ impl HfTokenizerConfigJsonFormatter { ); supports_add_generation_prompt = Some(true); } - env.add_template_owned("default", x.to_string())?; - env.add_template_owned("tool_use", x.to_string())?; + // Remove known non-standard tags before validation (they don't affect output) + let template_cleaned = remove_known_non_jinja2_tags(x); + env.add_template_owned("default", template_cleaned.clone())?; + env.add_template_owned("tool_use", template_cleaned)?; } Either::Right(map) => { for t in map { @@ -87,7 +104,9 @@ impl HfTokenizerConfigJsonFormatter { } else { supports_add_generation_prompt = Some(false); } - env.add_template_owned(k.to_string(), v.to_string())?; + // Remove known non-standard tags before validation (they don't affect output) + let template_cleaned = remove_known_non_jinja2_tags(v); + env.add_template_owned(k.to_string(), template_cleaned)?; } } if env.templates().count() == 0 { @@ -117,3 +136,30 @@ impl HfTokenizerConfigJsonFormatter { // // fn apply_tool_template() // } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_remove_known_non_jinja2_tags() { + let template = + "USER: {{ message }} ASSISTANT: {% generation %}Reply here{% endgeneration %}"; + let result = remove_known_non_jinja2_tags(template); + assert_eq!(result, "USER: {{ message }} ASSISTANT: Reply here"); + } + + #[test] + fn test_remove_known_non_jinja2_tags_preserves_standard_tags() { + let template = "{% for item in items %}{{ item }}{% endfor %}"; + let result = remove_known_non_jinja2_tags(template); + assert_eq!(result, template); + } + + #[test] + fn test_remove_known_non_jinja2_tags_multiple() { + let template = "Start {% generation %}Part 1{% endgeneration %} middle {% generation %}Part 2{% endgeneration %}"; + let result = remove_known_non_jinja2_tags(template); + assert_eq!(result, "Start Part 1 middle Part 2"); + } +} From 9b823049fff68084800bf0ea3c87c8d3f37111b2 Mon Sep 17 00:00:00 2001 From: Krishnan Prashanth Date: Tue, 18 Nov 2025 15:37:02 -0800 Subject: [PATCH 6/7] Fixing unintended changes Signed-off-by: Krishnan Prashanth --- .../prompt/template/formatters.rs | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs index 86ab3c73b4c..9f5bcb981c7 100644 --- a/lib/llm/src/preprocessor/prompt/template/formatters.rs +++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs @@ -25,6 +25,41 @@ fn remove_known_non_jinja2_tags(template: &str) -> String { .replace("{% endgeneration %}", "") } +/// Detects whether a chat template requires message content as arrays (multimodal) +/// or accepts simple strings (standard text-only templates). +/// +/// This function test-renders the template with both formats: +/// - Array format: `[{"type": "text", "text": "template_test"}]` +/// - String format: `"template_test"` +/// +/// If the array format works but string format doesn't produce output, +/// the template requires arrays (multimodal templates). +fn detect_content_array_usage(env: &Environment) -> bool { + // Test with array format + let array_msg = context! { + messages => json!([{"role": "user", "content": [{"type": "text", "text": "template_test"}]}]), + add_generation_prompt => false, + }; + + // Test with string format + let string_msg = context! { + messages => json!([{"role": "user", "content": "template_test"}]), + add_generation_prompt => false, + }; + + let out_array = env + .get_template("default") + .and_then(|t| t.render(&array_msg)) + .unwrap_or_default(); + let out_string = env + .get_template("default") + .and_then(|t| t.render(&string_msg)) + .unwrap_or_default(); + + // If array works but string doesn't, template requires arrays + out_array.contains("template_test") && !out_string.contains("template_test") +} + impl JinjaEnvironment { fn env(self) -> Environment<'static> { self.env From fbe3d6875f90963a66abf6abe5588d5a44a60fb6 Mon Sep 17 00:00:00 2001 From: Krishnan Prashanth Date: Tue, 18 Nov 2025 15:47:47 -0800 Subject: [PATCH 7/7] Adding Llava test Signed-off-by: Krishnan Prashanth --- tests/serve/test_vllm.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/serve/test_vllm.py b/tests/serve/test_vllm.py index d14ea486882..044d7a2c9d4 100644 --- a/tests/serve/test_vllm.py +++ b/tests/serve/test_vllm.py @@ -217,6 +217,33 @@ class VLLMConfig(EngineConfig): ), ], ), + "multimodal_agg_llava": VLLMConfig( + name="multimodal_agg_llava", + directory=vllm_dir, + script_name="agg_multimodal.sh", + marks=[pytest.mark.gpu_2], + model="llava-hf/llava-1.5-7b-hf", + script_args=["--model", "llava-hf/llava-1.5-7b-hf"], + delayed_start=0, + timeout=360, + request_payloads=[ + # HTTP URL test + chat_payload( + [ + {"type": "text", "text": "What is in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "http://images.cocodataset.org/test2017/000000155781.jpg" + }, + }, + ], + repeat_count=1, + expected_response=["bus"], + temperature=0.0, + ), + ], + ), # TODO: Update this test case when we have video multimodal support in vllm official components "multimodal_video_agg": VLLMConfig( name="multimodal_video_agg",