From 879b1d5af7a7827498afea50d398d845f8a8e21c Mon Sep 17 00:00:00 2001
From: Keiven Chang <keivenchang@users.noreply.github.com>
Date: Sun, 16 Nov 2025 01:48:14 +0000
Subject: [PATCH 1/7] feat: support multimodal models with non-standard Jinja2
 tags

Implement placeholder replacement for non-standard Jinja2 tags (e.g., {% generation %})
that minijinja doesn't recognize but vLLM's custom extensions require.

Rust frontend replaces tags with __JINJA_BLOCK_<TAG>__ for validation, Python backend
restores them before vLLM processing. Also fix model card parsing for multimodal models:
make num_hidden_layers optional and handle eos_token_id as single value or array.

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
---
 Cargo.lock                                    | 222 +-----------------
 .../vllm/multimodal_utils/chat_processor.py   |  12 +
 lib/llm/src/model_card.rs                     |  26 +-
 .../prompt/template/formatters.rs             |  73 +++++-
 4 files changed, 107 insertions(+), 226 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 32fd26794f2..86c26e48f46 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -238,18 +238,6 @@ dependencies = [
  "pin-project-lite",
 ]
 
-[[package]]
-name = "async-channel"
-version = "2.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "924ed96dd52d1b75e9c1a3e6275715fd320f5f9439fb5a4a11fa51f4221158d2"
-dependencies = [
- "concurrent-queue",
- "event-listener-strategy",
- "futures-core",
- "pin-project-lite",
-]
-
 [[package]]
 name = "async-nats"
 version = "0.40.0"
@@ -2089,16 +2077,6 @@ dependencies = [
  "syn 2.0.110",
 ]
 
-[[package]]
-name = "dlpark"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc178fc3bf4ce54c26ccffcf271ff574954ac4b940f15121be3d69f277194537"
-dependencies = [
- "half 2.7.1",
- "pyo3",
-]
-
 [[package]]
 name = "dlv-list"
 version = "0.5.2"
@@ -4072,15 +4050,6 @@ dependencies = [
  "web-time",
 ]
 
-[[package]]
-name = "indoc"
-version = "2.0.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
-dependencies = [
- "rustversion",
-]
-
 [[package]]
 name = "inlinable_string"
 version = "0.1.15"
@@ -4170,15 +4139,6 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
-[[package]]
-name = "inventory"
-version = "0.3.21"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc61209c082fbeb19919bee74b176221b27223e27b65d781eb91af24eb1fb46e"
-dependencies = [
- "rustversion",
-]
-
 [[package]]
 name = "iovec"
 version = "0.1.4"
@@ -4601,40 +4561,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "kvbm-py3"
-version = "0.7.0"
-dependencies = [
- "anyhow",
- "async-stream",
- "async-trait",
- "cudarc 0.16.6",
- "derive-getters",
- "dlpark",
- "dynamo-llm",
- "dynamo-runtime",
- "either",
- "futures",
- "local-ip-address",
- "once_cell",
- "prometheus",
- "pyo3",
- "pyo3-async-runtimes",
- "pythonize",
- "rand 0.9.2",
- "rstest 0.25.0",
- "serde",
- "serde_json",
- "socket2 0.6.1",
- "thiserror 2.0.17",
- "tokio",
- "tokio-stream",
- "tokio-util",
- "tracing",
- "tracing-subscriber",
- "uuid 1.18.1",
-]
-
 [[package]]
 name = "lalrpop-util"
 version = "0.20.2"
@@ -5004,15 +4930,6 @@ dependencies = [
  "autocfg",
 ]
 
-[[package]]
-name = "memoffset"
-version = "0.9.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
-dependencies = [
- "autocfg",
-]
-
 [[package]]
 name = "metal"
 version = "0.27.0"
@@ -5581,7 +5498,7 @@ dependencies = [
  "bitflags 1.3.2",
  "cfg-if 1.0.4",
  "libc",
- "memoffset 0.7.1",
+ "memoffset",
  "pin-utils",
 ]
 
@@ -6751,107 +6668,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "pyo3"
-version = "0.23.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872"
-dependencies = [
- "cfg-if 1.0.4",
- "indoc",
- "libc",
- "memoffset 0.9.1",
- "once_cell",
- "portable-atomic",
- "pyo3-build-config",
- "pyo3-ffi",
- "pyo3-macros",
- "unindent",
-]
-
-[[package]]
-name = "pyo3-async-runtimes"
-version = "0.23.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "977dc837525cfd22919ba6a831413854beb7c99a256c03bf8624ad707e45810e"
-dependencies = [
- "async-channel",
- "clap 4.5.51",
- "futures",
- "inventory",
- "once_cell",
- "pin-project-lite",
- "pyo3",
- "pyo3-async-runtimes-macros",
- "tokio",
-]
-
-[[package]]
-name = "pyo3-async-runtimes-macros"
-version = "0.23.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b2df2884957d2476731f987673befac5d521dff10abb0a7cbe12015bc7702fe9"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.110",
-]
-
-[[package]]
-name = "pyo3-build-config"
-version = "0.23.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb"
-dependencies = [
- "once_cell",
- "target-lexicon",
-]
-
-[[package]]
-name = "pyo3-ffi"
-version = "0.23.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d"
-dependencies = [
- "libc",
- "pyo3-build-config",
-]
-
-[[package]]
-name = "pyo3-macros"
-version = "0.23.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da"
-dependencies = [
- "proc-macro2",
- "pyo3-macros-backend",
- "quote",
- "syn 2.0.110",
-]
-
-[[package]]
-name = "pyo3-macros-backend"
-version = "0.23.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028"
-dependencies = [
- "heck 0.5.0",
- "proc-macro2",
- "pyo3-build-config",
- "quote",
- "syn 2.0.110",
-]
-
-[[package]]
-name = "pythonize"
-version = "0.23.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91a6ee7a084f913f98d70cdc3ebec07e852b735ae3059a1500db2661265da9ff"
-dependencies = [
- "pyo3",
- "serde",
-]
-
 [[package]]
 name = "qoi"
 version = "0.4.1"
@@ -7442,18 +7258,6 @@ dependencies = [
  "rustc_version",
 ]
 
-[[package]]
-name = "rstest"
-version = "0.25.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6fc39292f8613e913f7df8fa892b8944ceb47c247b78e1b1ae2f09e019be789d"
-dependencies = [
- "futures-timer",
- "futures-util",
- "rstest_macros 0.25.0",
- "rustc_version",
-]
-
 [[package]]
 name = "rstest_macros"
 version = "0.18.2"
@@ -7489,24 +7293,6 @@ dependencies = [
  "unicode-ident",
 ]
 
-[[package]]
-name = "rstest_macros"
-version = "0.25.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f168d99749d307be9de54d23fd226628d99768225ef08f6ffb52e0182a27746"
-dependencies = [
- "cfg-if 1.0.4",
- "glob",
- "proc-macro-crate",
- "proc-macro2",
- "quote",
- "regex",
- "relative-path",
- "rustc_version",
- "syn 2.0.110",
- "unicode-ident",
-]
-
 [[package]]
 name = "rstest_reuse"
 version = "0.7.0"
@@ -9979,12 +9765,6 @@ dependencies = [
  "rand 0.8.5",
 ]
 
-[[package]]
-name = "unindent"
-version = "0.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
-
 [[package]]
 name = "unsafe-libyaml"
 version = "0.2.11"
diff --git a/components/src/dynamo/vllm/multimodal_utils/chat_processor.py b/components/src/dynamo/vllm/multimodal_utils/chat_processor.py
index fe8d95dc81d..5ef4305aea4 100644
--- a/components/src/dynamo/vllm/multimodal_utils/chat_processor.py
+++ b/components/src/dynamo/vllm/multimodal_utils/chat_processor.py
@@ -146,6 +146,18 @@ async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResu
         else:
             chat_template = request.chat_template or self.tokenizer.chat_template
 
+        # Restore non-standard Jinja2 tags that were replaced with placeholders by Rust frontend
+        # for minijinja validation. Backend frameworks (like vLLM) may have custom extensions for these.
+        if chat_template:
+            import re
+
+            # Restore all __JINJA_BLOCK_<TAG>__ placeholders back to {% tag %}
+            chat_template = re.sub(
+                r"__JINJA_BLOCK_([A-Z_]+)__",
+                lambda m: "{% " + m.group(1).lower() + " %}",
+                chat_template,
+            )
+
         (
             conversation,
             request_prompts,
diff --git a/lib/llm/src/model_card.rs b/lib/llm/src/model_card.rs
index 0674e1c0a16..3d6250ad420 100644
--- a/lib/llm/src/model_card.rs
+++ b/lib/llm/src/model_card.rs
@@ -621,7 +621,8 @@ struct HFTextConfig {
     max_position_embeddings: Option<usize>,
 
     /// number of layers in the model
-    num_hidden_layers: usize,
+    /// Optional because some multimodal models (e.g., LLaVA) don't include this in text_config
+    num_hidden_layers: Option<usize>,
 
     /// number of attention heads in the model
     num_attention_heads: Option<usize>,
@@ -701,11 +702,32 @@ impl HFConfig {
             })
             .or_else(|| {
                 // Maybe it's in generation_config.json
-                crate::file_json_field(&gencfg_path, "eos_token_id")
+                crate::file_json_field::<serde_json::Value>(&gencfg_path, "eos_token_id")
                 .inspect_err(
                     |err| tracing::warn!(%err, "Missing eos_token_id in generation_config.json"),
                 )
                 .ok()
+                .and_then(|v| {
+                    if v.is_number() {
+                        v.as_number()
+                            .and_then(|n| n.as_u64())
+                            .map(|n| vec![n as TokenIdType])
+                    } else if v.is_array() {
+                        let arr = v.as_array().unwrap();
+                        Some(
+                            arr.iter()
+                                .filter_map(|inner_v| {
+                                    inner_v
+                                        .as_number()
+                                        .and_then(|n| n.as_u64())
+                                        .map(|n| n as TokenIdType)
+                                })
+                                .collect(),
+                        )
+                    } else {
+                        None
+                    }
+                })
             })
             .ok_or_else(|| {
                 anyhow::anyhow!(
diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs
index 82a63a6bf09..5382562fa3c 100644
--- a/lib/llm/src/preprocessor/prompt/template/formatters.rs
+++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs
@@ -9,6 +9,67 @@ use either::Either;
 use minijinja::{Environment, Value};
 use tracing;
 
+/// Replace non-standard Jinja2 block tags with placeholders
+///
+/// minijinja doesn't expose its tag list publicly - they're hardcoded in a private match statement
+/// in the parser. This list is derived from minijinja v2.12.0's parser.rs implementation.
+/// See: https://github.com/mitsuhiko/minijinja/blob/main/minijinja/src/compiler/parser.rs#L542
+fn replace_non_standard_blocks(template: &str) -> String {
+    use regex::Regex;
+
+    // Standard Jinja2/minijinja tags (cannot be queried from minijinja API)
+    let standard_keywords = [
+        "for",
+        "endfor",
+        "if",
+        "elif",
+        "else",
+        "endif",
+        "block",
+        "endblock",
+        "extends",
+        "include",
+        "import",
+        "from",
+        "macro",
+        "endmacro",
+        "call",
+        "endcall",
+        "set",
+        "endset",
+        "with",
+        "endwith",
+        "filter",
+        "endfilter",
+        "autoescape",
+        "endautoescape",
+        "raw",
+        "endraw",
+        "do",
+    ];
+
+    let re = Regex::new(r"\{%\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*%\}").unwrap();
+    let mut result = template.to_string();
+    let mut replacements = Vec::new();
+
+    for cap in re.captures_iter(template) {
+        let full_match = cap.get(0).unwrap().as_str();
+        let tag_name = cap.get(1).unwrap().as_str();
+
+        if !standard_keywords.contains(&tag_name) {
+            // Non-standard tag (e.g., vLLM's {% generation %}) - replace with placeholder
+            let placeholder = format!("__JINJA_BLOCK_{}", tag_name.to_uppercase());
+            replacements.push((full_match.to_string(), placeholder));
+        }
+    }
+
+    for (original, placeholder) in replacements {
+        result = result.replace(&original, &placeholder);
+    }
+
+    result
+}
+
 impl JinjaEnvironment {
     fn env(self) -> Environment<'static> {
         self.env
@@ -64,8 +125,12 @@ impl HfTokenizerConfigJsonFormatter {
                     );
                     supports_add_generation_prompt = Some(true);
                 }
-                env.add_template_owned("default", x.to_string())?;
-                env.add_template_owned("tool_use", x.to_string())?;
+                // Replace non-standard Jinja2 block tags with placeholders for minijinja validation
+                // Standard Jinja2/minijinja blocks: for, if, block, macro, call, filter, set, with, autoescape, trans
+                // Any other {% tag %} blocks are likely backend-specific extensions (like vLLM's {% generation %})
+                let template_for_validation = replace_non_standard_blocks(x);
+                env.add_template_owned("default", template_for_validation.clone())?;
+                env.add_template_owned("tool_use", template_for_validation)?;
             }
             Either::Right(map) => {
                 for t in map {
@@ -87,7 +152,9 @@ impl HfTokenizerConfigJsonFormatter {
                         } else {
                             supports_add_generation_prompt = Some(false);
                         }
-                        env.add_template_owned(k.to_string(), v.to_string())?;
+                        // Replace non-standard Jinja2 block tags with placeholders for minijinja validation
+                        let template_for_validation = replace_non_standard_blocks(v);
+                        env.add_template_owned(k.to_string(), template_for_validation)?;
                     }
                 }
                 if env.templates().count() == 0 {

From d5eaec9c00087f9512dca46a040ebc582bc7c5a5 Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth <kprashanth@nvidia.com>
Date: Mon, 17 Nov 2025 04:31:50 -0800
Subject: [PATCH 2/7] Handle chat templates that expect message[content] be a
 string

Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
---
 lib/llm/src/preprocessor/prompt/template.rs   |   1 +
 .../prompt/template/formatters.rs             |  47 ++++++
 .../src/preprocessor/prompt/template/oai.rs   | 148 ++++++++++++++----
 3 files changed, 168 insertions(+), 28 deletions(-)

diff --git a/lib/llm/src/preprocessor/prompt/template.rs b/lib/llm/src/preprocessor/prompt/template.rs
index 87748046ae0..8ea6f714942 100644
--- a/lib/llm/src/preprocessor/prompt/template.rs
+++ b/lib/llm/src/preprocessor/prompt/template.rs
@@ -106,6 +106,7 @@ struct HfTokenizerConfigJsonFormatter {
     config: ChatTemplate,
     mixins: Arc<ContextMixins>,
     supports_add_generation_prompt: bool,
+    requires_content_arrays: bool,
 }
 
 // /// OpenAI Standard Prompt Formatter
diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs
index 5382562fa3c..edb93eb8963 100644
--- a/lib/llm/src/preprocessor/prompt/template/formatters.rs
+++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs
@@ -70,6 +70,44 @@ fn replace_non_standard_blocks(template: &str) -> String {
     result
 }
 
+/// Detects whether a chat template requires message content as arrays (multimodal)
+/// or accepts simple strings (standard text-only templates).
+///
+/// This function test-renders the template with both formats:
+/// - Array format: `[{"type": "text", "text": "X"}]`
+/// - String format: `"X"`
+///
+/// If the array format works but string format doesn't produce output,
+/// the template requires arrays (e.g., llava, Qwen-VL multimodal templates).
+fn detect_content_array_usage(env: &Environment) -> bool {
+    use minijinja::context;
+    use serde_json::json;
+
+    // Test with array format
+    let test_array = context! {
+        messages => json!([{"role": "user", "content": [{"type": "text", "text": "X"}]}]),
+        add_generation_prompt => false,
+    };
+
+    // Test with string format
+    let test_string = context! {
+        messages => json!([{"role": "user", "content": "X"}]),
+        add_generation_prompt => false,
+    };
+
+    let out_array = env
+        .get_template("default")
+        .and_then(|t| t.render(&test_array))
+        .unwrap_or_default();
+    let out_string = env
+        .get_template("default")
+        .and_then(|t| t.render(&test_string))
+        .unwrap_or_default();
+
+    // If array works but string doesn't, template requires arrays
+    out_array.contains("X") && !out_string.contains("X")
+}
+
 impl JinjaEnvironment {
     fn env(self) -> Environment<'static> {
         self.env
@@ -165,11 +203,20 @@ impl HfTokenizerConfigJsonFormatter {
             }
         }
 
+        // Detect at model load time whether this template requires content arrays
+        let requires_content_arrays = detect_content_array_usage(&env);
+
+        tracing::info!(
+            "Template analysis: requires_content_arrays = {}",
+            requires_content_arrays
+        );
+
         Ok(HfTokenizerConfigJsonFormatter {
             env,
             config,
             mixins: Arc::new(mixins),
             supports_add_generation_prompt: supports_add_generation_prompt.unwrap_or(false),
+            requires_content_arrays,
         })
     }
 }
diff --git a/lib/llm/src/preprocessor/prompt/template/oai.rs b/lib/llm/src/preprocessor/prompt/template/oai.rs
index 9d02e3a6980..4de38a020e5 100644
--- a/lib/llm/src/preprocessor/prompt/template/oai.rs
+++ b/lib/llm/src/preprocessor/prompt/template/oai.rs
@@ -73,10 +73,11 @@ fn may_be_fix_tool_schema(tools: serde_json::Value) -> Option<Value> {
     Some(Value::from_serialize(&updated_tools))
 }
 
-fn may_be_fix_msg_content(messages: serde_json::Value) -> Value {
-    // If messages[content] is provided as a list containing ONLY text parts,
-    // concatenate them into a string to match chat template expectations.
-    // Mixed content types are left for chat templates to handle.
+fn may_be_fix_msg_content(messages: serde_json::Value, preserve_arrays: bool) -> Value {
+    // Bidirectional normalization for message content format:
+    // - If preserve_arrays=true (multimodal templates): Convert strings → arrays
+    // - If preserve_arrays=false (standard templates): Flatten text-only arrays → strings
+    // - Mixed content types are always preserved as-is
 
     let Some(arr) = messages.as_array() else {
         return Value::from_serialize(&messages);
@@ -86,7 +87,20 @@ fn may_be_fix_msg_content(messages: serde_json::Value) -> Value {
         .iter()
         .map(|msg| {
             match msg.get("content") {
-                Some(serde_json::Value::Array(content_array)) => {
+                // Case 1: String → Array (for multimodal templates)
+                Some(serde_json::Value::String(text)) if preserve_arrays => {
+                    let mut modified_msg = msg.clone();
+                    if let Some(msg_object) = modified_msg.as_object_mut() {
+                        let content_array = serde_json::json!([{
+                            "type": "text",
+                            "text": text
+                        }]);
+                        msg_object.insert("content".to_string(), content_array);
+                    }
+                    modified_msg
+                }
+                // Case 2: Array → String (for standard templates)
+                Some(serde_json::Value::Array(content_array)) if !preserve_arrays => {
                     let is_text_only_array = !content_array.is_empty()
                         && content_array.iter().all(|part| {
                             part.get("type")
@@ -159,19 +173,7 @@ impl OAIChatLikeRequest for NvCreateChatCompletionRequest {
 
     fn messages(&self) -> Value {
         let messages_json = serde_json::to_value(&self.inner.messages).unwrap();
-
-        let needs_fixing = if let Some(arr) = messages_json.as_array() {
-            arr.iter()
-                .any(|msg| msg.get("content").and_then(|c| c.as_array()).is_some())
-        } else {
-            false
-        };
-
-        if needs_fixing {
-            may_be_fix_msg_content(messages_json)
-        } else {
-            Value::from_serialize(&messages_json)
-        }
+        Value::from_serialize(&messages_json)
     }
 
     fn tools(&self) -> Option<Value> {
@@ -301,6 +303,15 @@ impl OAIPromptFormatter for HfTokenizerConfigJsonFormatter {
         let messages_canonical = req.messages();
         let mut messages_for_template: serde_json::Value =
             serde_json::to_value(&messages_canonical).unwrap();
+
+        // Apply bidirectional content normalization based on template requirements
+        let preserve_arrays = self.requires_content_arrays;
+        messages_for_template = serde_json::to_value(may_be_fix_msg_content(
+            messages_for_template,
+            preserve_arrays,
+        ))
+        .unwrap();
+
         normalize_tool_arguments_in_messages(&mut messages_for_template);
 
         let ctx = context! {
@@ -457,7 +468,10 @@ mod tests {
         }"#;
 
         let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
-        let messages = serde_json::to_value(request.messages()).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+
+        // Test array → string normalization (preserve_arrays=false for standard templates)
+        let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap();
 
         // Verify: text-only array is concatenated into a single string
         assert_eq!(
@@ -500,7 +514,10 @@ mod tests {
         }"#;
 
         let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
-        let messages = serde_json::to_value(request.messages()).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+
+        // Test array → string normalization (preserve_arrays=false for standard templates)
+        let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap();
 
         // Verify: System message with string content remains unchanged
         assert_eq!(
@@ -541,7 +558,10 @@ mod tests {
         }"#;
 
         let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
-        let messages = serde_json::to_value(request.messages()).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+
+        // Empty arrays should be preserved regardless of preserve_arrays setting
+        let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap();
 
         // Verify: Empty arrays are preserved as-is
         assert!(messages[0]["content"].is_array());
@@ -562,7 +582,10 @@ mod tests {
         }"#;
 
         let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
-        let messages = serde_json::to_value(request.messages()).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+
+        // Test with preserve_arrays=false (standard templates)
+        let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap();
 
         // Verify: String content is not modified
         assert_eq!(
@@ -589,7 +612,10 @@ mod tests {
         }"#;
 
         let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
-        let messages = serde_json::to_value(request.messages()).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+
+        // Mixed content should be preserved regardless of preserve_arrays setting
+        let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap();
 
         // Verify: Mixed content types are preserved as array for template handling
         assert!(messages[0]["content"].is_array());
@@ -617,7 +643,10 @@ mod tests {
         }"#;
 
         let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
-        let messages = serde_json::to_value(request.messages()).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+
+        // Non-text arrays should be preserved regardless of preserve_arrays setting
+        let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap();
 
         // Verify: Non-text content arrays are preserved for template handling
         assert!(messages[0]["content"].is_array());
@@ -713,7 +742,8 @@ NORMAL MODE
         }"#;
 
         let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
-        let messages = serde_json::to_value(request.messages()).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+        let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap();
 
         // Mixed types should preserve array structure
         assert!(messages[0]["content"].is_array());
@@ -735,7 +765,8 @@ NORMAL MODE
         }"#;
 
         let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
-        let messages = serde_json::to_value(request.messages()).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+        let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap();
 
         // Unknown types mixed with text should preserve array
         assert!(messages[0]["content"].is_array());
@@ -873,11 +904,15 @@ NORMAL MODE
         }"#;
 
         let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
-        let mut messages = serde_json::to_value(request.messages()).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+
+        // Apply content normalization with preserve_arrays=false (standard templates)
+        let mut messages =
+            serde_json::to_value(may_be_fix_msg_content(messages_raw, false)).unwrap();
 
         normalize_tool_arguments_in_messages(&mut messages);
 
-        // Multimodal content preserved as array
+        // Multimodal content preserved as array (mixed types not flattened)
         assert!(messages[0]["content"].is_array());
         assert_eq!(messages[0]["content"].as_array().unwrap().len(), 3);
 
@@ -889,6 +924,63 @@ NORMAL MODE
         );
     }
 
+    /// Tests string → array normalization for multimodal templates
+    #[test]
+    fn test_may_be_fix_msg_content_string_to_array() {
+        let json_str = r#"{
+            "model": "gpt-4o",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "Hello, how are you?"
+                }
+            ]
+        }"#;
+
+        let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+
+        // Test with preserve_arrays=true (multimodal templates)
+        let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, true)).unwrap();
+
+        // Verify: String is converted to array format
+        assert!(messages[0]["content"].is_array());
+        let content_array = messages[0]["content"].as_array().unwrap();
+        assert_eq!(content_array.len(), 1);
+        assert_eq!(content_array[0]["type"], "text");
+        assert_eq!(content_array[0]["text"], "Hello, how are you?");
+    }
+
+    /// Tests that arrays are preserved when preserve_arrays=true
+    #[test]
+    fn test_may_be_fix_msg_content_array_preserved_with_multimodal() {
+        let json_str = r#"{
+            "model": "gpt-4o",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "part 1"},
+                        {"type": "text", "text": "part 2"}
+                    ]
+                }
+            ]
+        }"#;
+
+        let request: NvCreateChatCompletionRequest = serde_json::from_str(json_str).unwrap();
+        let messages_raw = serde_json::to_value(request.messages()).unwrap();
+
+        // Test with preserve_arrays=true (multimodal templates)
+        let messages = serde_json::to_value(may_be_fix_msg_content(messages_raw, true)).unwrap();
+
+        // Verify: Array is preserved as-is
+        assert!(messages[0]["content"].is_array());
+        let content_array = messages[0]["content"].as_array().unwrap();
+        assert_eq!(content_array.len(), 2);
+        assert_eq!(content_array[0]["text"], "part 1");
+        assert_eq!(content_array[1]["text"], "part 2");
+    }
+
     fn user() -> Msg {
         Msg::User(Default::default())
     }

From 7e662ce7399f66f2d2deec1075ccd4257352a8c9 Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth <kprashanth@nvidia.com>
Date: Mon, 17 Nov 2025 04:59:12 -0800
Subject: [PATCH 3/7] Cleaning up comments + variable names + structure

Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
---
 .../prompt/template/formatters.rs             | 25 +++++++++----------
 .../src/preprocessor/prompt/template/oai.rs   |  4 +--
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs
index edb93eb8963..03298ec9f3b 100644
--- a/lib/llm/src/preprocessor/prompt/template/formatters.rs
+++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs
@@ -6,8 +6,9 @@ use std::sync::Arc;
 use super::tokcfg::{ChatTemplate, raise_exception, strftime_now, tojson};
 use super::{ContextMixins, HfTokenizerConfigJsonFormatter, JinjaEnvironment};
 use either::Either;
-use minijinja::{Environment, Value};
+use minijinja::{Environment, Value, context};
 use tracing;
+use serde_json::json;
 
 /// Replace non-standard Jinja2 block tags with placeholders
 ///
@@ -74,38 +75,36 @@ fn replace_non_standard_blocks(template: &str) -> String {
 /// or accepts simple strings (standard text-only templates).
 ///
 /// This function test-renders the template with both formats:
-/// - Array format: `[{"type": "text", "text": "X"}]`
-/// - String format: `"X"`
+/// - Array format: `[{"type": "text", "text": "template_test"}]`
+/// - String format: `"template_test"`
 ///
 /// If the array format works but string format doesn't produce output,
-/// the template requires arrays (e.g., llava, Qwen-VL multimodal templates).
+/// the template requires arrays (multimodal templates).
 fn detect_content_array_usage(env: &Environment) -> bool {
-    use minijinja::context;
-    use serde_json::json;
 
     // Test with array format
-    let test_array = context! {
-        messages => json!([{"role": "user", "content": [{"type": "text", "text": "X"}]}]),
+    let array_msg = context! {
+        messages => json!([{"role": "user", "content": [{"type": "text", "text": "template_test"}]}]),
         add_generation_prompt => false,
     };
 
     // Test with string format
-    let test_string = context! {
-        messages => json!([{"role": "user", "content": "X"}]),
+    let string_msg = context! {
+        messages => json!([{"role": "user", "content": "template_test"}]),
         add_generation_prompt => false,
     };
 
     let out_array = env
         .get_template("default")
-        .and_then(|t| t.render(&test_array))
+        .and_then(|t| t.render(&array_msg))
         .unwrap_or_default();
     let out_string = env
         .get_template("default")
-        .and_then(|t| t.render(&test_string))
+        .and_then(|t| t.render(&string_msg))
         .unwrap_or_default();
 
     // If array works but string doesn't, template requires arrays
-    out_array.contains("X") && !out_string.contains("X")
+    out_array.contains("template_test") && !out_string.contains("template_test")
 }
 
 impl JinjaEnvironment {
diff --git a/lib/llm/src/preprocessor/prompt/template/oai.rs b/lib/llm/src/preprocessor/prompt/template/oai.rs
index 4de38a020e5..d041ebf5ff3 100644
--- a/lib/llm/src/preprocessor/prompt/template/oai.rs
+++ b/lib/llm/src/preprocessor/prompt/template/oai.rs
@@ -87,7 +87,7 @@ fn may_be_fix_msg_content(messages: serde_json::Value, preserve_arrays: bool) ->
         .iter()
         .map(|msg| {
             match msg.get("content") {
-                // Case 1: String → Array (for multimodal templates)
+                // Case 1: String to Array (for multimodal templates)
                 Some(serde_json::Value::String(text)) if preserve_arrays => {
                     let mut modified_msg = msg.clone();
                     if let Some(msg_object) = modified_msg.as_object_mut() {
@@ -99,7 +99,7 @@ fn may_be_fix_msg_content(messages: serde_json::Value, preserve_arrays: bool) ->
                     }
                     modified_msg
                 }
-                // Case 2: Array → String (for standard templates)
+                // Case 2: Array to String (for standard templates)
                 Some(serde_json::Value::Array(content_array)) if !preserve_arrays => {
                     let is_text_only_array = !content_array.is_empty()
                         && content_array.iter().all(|part| {

From fdfc7dac33d0e0ec102ae78dc3c2613461bf8796 Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth <kprashanth@nvidia.com>
Date: Mon, 17 Nov 2025 06:14:29 -0800
Subject: [PATCH 4/7] fix: cargo_fmt fix

Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
---
 lib/llm/src/preprocessor/prompt/template/formatters.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs
index 03298ec9f3b..225a341b5ad 100644
--- a/lib/llm/src/preprocessor/prompt/template/formatters.rs
+++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs
@@ -7,8 +7,8 @@ use super::tokcfg::{ChatTemplate, raise_exception, strftime_now, tojson};
 use super::{ContextMixins, HfTokenizerConfigJsonFormatter, JinjaEnvironment};
 use either::Either;
 use minijinja::{Environment, Value, context};
-use tracing;
 use serde_json::json;
+use tracing;
 
 /// Replace non-standard Jinja2 block tags with placeholders
 ///
@@ -81,7 +81,6 @@ fn replace_non_standard_blocks(template: &str) -> String {
 /// If the array format works but string format doesn't produce output,
 /// the template requires arrays (multimodal templates).
 fn detect_content_array_usage(env: &Environment) -> bool {
-
     // Test with array format
     let array_msg = context! {
         messages => json!([{"role": "user", "content": [{"type": "text", "text": "template_test"}]}]),

From ef0165f2b6bdba6cb6b390d02b79d1c41655f1d8 Mon Sep 17 00:00:00 2001
From: Keiven Chang <keivenchang@users.noreply.github.com>
Date: Mon, 17 Nov 2025 23:38:11 +0000
Subject: [PATCH 5/7] fix: Handle two non Jinja2 tags in chat templates

- Add remove_known_non_jinja2_tags to strip {% generation %} tags before minijinja validation
- Fixes LLaVA and other models using vLLM-specific template extensions
- Make num_hidden_layers optional in model card parsing for multimodal compatibility
- Handle eos_token_id as single value or array in generation_config.json

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
---
 lib/llm/src/model_card.rs                     | 26 +++++++++-
 .../prompt/template/formatters.rs             | 52 +++++++++++++++++--
 2 files changed, 73 insertions(+), 5 deletions(-)

diff --git a/lib/llm/src/model_card.rs b/lib/llm/src/model_card.rs
index 0674e1c0a16..3d6250ad420 100644
--- a/lib/llm/src/model_card.rs
+++ b/lib/llm/src/model_card.rs
@@ -621,7 +621,8 @@ struct HFTextConfig {
     max_position_embeddings: Option<usize>,
 
     /// number of layers in the model
-    num_hidden_layers: usize,
+    /// Optional because some multimodal models (e.g., LLaVA) don't include this in text_config
+    num_hidden_layers: Option<usize>,
 
     /// number of attention heads in the model
     num_attention_heads: Option<usize>,
@@ -701,11 +702,32 @@ impl HFConfig {
             })
             .or_else(|| {
                 // Maybe it's in generation_config.json
-                crate::file_json_field(&gencfg_path, "eos_token_id")
+                crate::file_json_field::<serde_json::Value>(&gencfg_path, "eos_token_id")
                 .inspect_err(
                     |err| tracing::warn!(%err, "Missing eos_token_id in generation_config.json"),
                 )
                 .ok()
+                .and_then(|v| {
+                    if v.is_number() {
+                        v.as_number()
+                            .and_then(|n| n.as_u64())
+                            .map(|n| vec![n as TokenIdType])
+                    } else if v.is_array() {
+                        let arr = v.as_array().unwrap();
+                        Some(
+                            arr.iter()
+                                .filter_map(|inner_v| {
+                                    inner_v
+                                        .as_number()
+                                        .and_then(|n| n.as_u64())
+                                        .map(|n| n as TokenIdType)
+                                })
+                                .collect(),
+                        )
+                    } else {
+                        None
+                    }
+                })
             })
             .ok_or_else(|| {
                 anyhow::anyhow!(
diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs
index 82a63a6bf09..0289300ef69 100644
--- a/lib/llm/src/preprocessor/prompt/template/formatters.rs
+++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs
@@ -9,6 +9,21 @@ use either::Either;
 use minijinja::{Environment, Value};
 use tracing;
 
+/// Remove known non-standard Jinja2 tags from chat templates
+///
+/// Some models use custom Jinja2 extensions that minijinja doesn't recognize. These tags
+/// are typically metadata markers that don't affect the rendered output. For example:
+/// - {% generation %} / {% endgeneration %}: Used by vLLM's AssistantTracker to mark
+///   assistant-generated content. The tags themselves don't produce output.
+///
+/// By removing these tags before validation, we allow templates with backend-specific
+/// extensions to work with minijinja while maintaining correct output semantics.
+fn remove_known_non_jinja2_tags(template: &str) -> String {
+    template
+        .replace("{% generation %}", "")
+        .replace("{% endgeneration %}", "")
+}
+
 impl JinjaEnvironment {
     fn env(self) -> Environment<'static> {
         self.env
@@ -64,8 +79,10 @@ impl HfTokenizerConfigJsonFormatter {
                     );
                     supports_add_generation_prompt = Some(true);
                 }
-                env.add_template_owned("default", x.to_string())?;
-                env.add_template_owned("tool_use", x.to_string())?;
+                // Remove known non-standard tags before validation (they don't affect output)
+                let template_cleaned = remove_known_non_jinja2_tags(x);
+                env.add_template_owned("default", template_cleaned.clone())?;
+                env.add_template_owned("tool_use", template_cleaned)?;
             }
             Either::Right(map) => {
                 for t in map {
@@ -87,7 +104,9 @@ impl HfTokenizerConfigJsonFormatter {
                         } else {
                             supports_add_generation_prompt = Some(false);
                         }
-                        env.add_template_owned(k.to_string(), v.to_string())?;
+                        // Remove known non-standard tags before validation (they don't affect output)
+                        let template_cleaned = remove_known_non_jinja2_tags(v);
+                        env.add_template_owned(k.to_string(), template_cleaned)?;
                     }
                 }
                 if env.templates().count() == 0 {
@@ -117,3 +136,30 @@ impl HfTokenizerConfigJsonFormatter {
 
 //     // fn apply_tool_template()
 // }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_remove_known_non_jinja2_tags() {
+        let template =
+            "USER: {{ message }} ASSISTANT: {% generation %}Reply here{% endgeneration %}";
+        let result = remove_known_non_jinja2_tags(template);
+        assert_eq!(result, "USER: {{ message }} ASSISTANT: Reply here");
+    }
+
+    #[test]
+    fn test_remove_known_non_jinja2_tags_preserves_standard_tags() {
+        let template = "{% for item in items %}{{ item }}{% endfor %}";
+        let result = remove_known_non_jinja2_tags(template);
+        assert_eq!(result, template);
+    }
+
+    #[test]
+    fn test_remove_known_non_jinja2_tags_multiple() {
+        let template = "Start {% generation %}Part 1{% endgeneration %} middle {% generation %}Part 2{% endgeneration %}";
+        let result = remove_known_non_jinja2_tags(template);
+        assert_eq!(result, "Start Part 1 middle Part 2");
+    }
+}

From 9b823049fff68084800bf0ea3c87c8d3f37111b2 Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth <kprashanth@nvidia.com>
Date: Tue, 18 Nov 2025 15:37:02 -0800
Subject: [PATCH 6/7] Fixing unintended changes

Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
---
 .../prompt/template/formatters.rs             | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/lib/llm/src/preprocessor/prompt/template/formatters.rs b/lib/llm/src/preprocessor/prompt/template/formatters.rs
index 86ab3c73b4c..9f5bcb981c7 100644
--- a/lib/llm/src/preprocessor/prompt/template/formatters.rs
+++ b/lib/llm/src/preprocessor/prompt/template/formatters.rs
@@ -25,6 +25,41 @@ fn remove_known_non_jinja2_tags(template: &str) -> String {
         .replace("{% endgeneration %}", "")
 }
 
+/// Detects whether a chat template requires message content as arrays (multimodal)
+/// or accepts simple strings (standard text-only templates).
+///
+/// This function test-renders the template with both formats:
+/// - Array format: `[{"type": "text", "text": "template_test"}]`
+/// - String format: `"template_test"`
+///
+/// If the array format works but string format doesn't produce output,
+/// the template requires arrays (multimodal templates).
+fn detect_content_array_usage(env: &Environment) -> bool {
+    // Test with array format
+    let array_msg = context! {
+        messages => json!([{"role": "user", "content": [{"type": "text", "text": "template_test"}]}]),
+        add_generation_prompt => false,
+    };
+
+    // Test with string format
+    let string_msg = context! {
+        messages => json!([{"role": "user", "content": "template_test"}]),
+        add_generation_prompt => false,
+    };
+
+    let out_array = env
+        .get_template("default")
+        .and_then(|t| t.render(&array_msg))
+        .unwrap_or_default();
+    let out_string = env
+        .get_template("default")
+        .and_then(|t| t.render(&string_msg))
+        .unwrap_or_default();
+
+    // If array works but string doesn't, template requires arrays
+    out_array.contains("template_test") && !out_string.contains("template_test")
+}
+
 impl JinjaEnvironment {
     fn env(self) -> Environment<'static> {
         self.env

From fbe3d6875f90963a66abf6abe5588d5a44a60fb6 Mon Sep 17 00:00:00 2001
From: Krishnan Prashanth <kprashanth@nvidia.com>
Date: Tue, 18 Nov 2025 15:47:47 -0800
Subject: [PATCH 7/7] Adding Llava test

Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
---
 tests/serve/test_vllm.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tests/serve/test_vllm.py b/tests/serve/test_vllm.py
index d14ea486882..044d7a2c9d4 100644
--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -217,6 +217,33 @@ class VLLMConfig(EngineConfig):
             ),
         ],
     ),
+    "multimodal_agg_llava": VLLMConfig(
+        name="multimodal_agg_llava",
+        directory=vllm_dir,
+        script_name="agg_multimodal.sh",
+        marks=[pytest.mark.gpu_2],
+        model="llava-hf/llava-1.5-7b-hf",
+        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
+        delayed_start=0,
+        timeout=360,
+        request_payloads=[
+            # HTTP URL test
+            chat_payload(
+                [
+                    {"type": "text", "text": "What is in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
+                        },
+                    },
+                ],
+                repeat_count=1,
+                expected_response=["bus"],
+                temperature=0.0,
+            ),
+        ],
+    ),
     # TODO: Update this test case when we have video multimodal support in vllm official components
     "multimodal_video_agg": VLLMConfig(
         name="multimodal_video_agg",