diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 2e0037bcfc5..3597a6baa3e 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -43,7 +43,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bf
examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4866931)
examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624)
examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-chunked_summarization_long] SKIP (https://nvbugs/5321371)
-test_e2e.py::test_openai_chat_structural_tag_example SKIP (https://nvbugspro.nvidia.com/bug/5375594)
cpp/test_e2e.py::test_model[fp8-chatglm-90] SKIP (https://nvbugs/5034830)
full:B200_PCIe/unittest/trt/functional SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization SKIP (Disable for Blackwell)
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_json.py b/tests/unittest/llmapi/apps/_test_openai_chat_json.py
index a444b5566b8..53651828507 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_json.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_json.py
@@ -26,11 +26,7 @@ def temp_extra_llm_api_options_file(request):
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
try:
- extra_llm_api_options_dict = {
- "guided_decoding_backend": "xgrammar",
- "disable_overlap_scheduler":
- True, # Guided decoding is not supported with overlap scheduler
- }
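+    # Only the guided decoding backend needs to be configured here; the
+    # overlap scheduler can stay enabled alongside guided decoding.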
+ extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}
with open(temp_file_path, "w") as f:
yaml.dump(extra_llm_api_options_dict, f)
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
index e3411404947..022b5a89863 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -1,25 +1,28 @@
# Adapted from
# https://github.com/vllm-project/vllm/blob/aae6927be06dedbda39c6b0c30f6aa3242b84388/tests/entrypoints/openai/test_chat.py
+import json
import os
+import re
import tempfile
+import jsonschema
import openai
import pytest
import yaml
-from ..test_llm import get_model_path, similar
+from ..test_llm import get_model_path
from .openai_server import RemoteOpenAIServer
pytestmark = pytest.mark.threadleak(enabled=False)
-@pytest.fixture(scope="module", ids=["TinyLlama-1.1B-Chat"])
+@pytest.fixture(scope="module")
def model_name():
return "llama-3.1-model/Llama-3.1-8B-Instruct"
@pytest.fixture(scope="module")
-def temp_extra_llm_api_options_file(request):
+def temp_extra_llm_api_options_file():
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
try:
@@ -37,7 +40,12 @@ def temp_extra_llm_api_options_file(request):
@pytest.fixture(scope="module")
def server(model_name: str, temp_extra_llm_api_options_file: str):
model_path = get_model_path(model_name)
- args = ["--extra_llm_api_options", temp_extra_llm_api_options_file]
+
+ # Use small max_batch_size/max_seq_len/max_num_tokens to avoid OOM on A10/A30 GPUs.
+ args = [
+ "--max_batch_size=8", "--max_seq_len=1024", "--max_num_tokens=1024",
+ f"--extra_llm_api_options={temp_extra_llm_api_options_file}"
+ ]
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server
@@ -112,12 +120,7 @@ def tool_get_current_date():
def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
tool_get_current_weather, tool_get_current_date):
- messages = [
- {
- "role":
- "system",
- "content":
- f"""
+ system_prompt = f"""
# Tool Instructions
- Always execute python code in messages that you share.
- When looking for real time information use relevant functions if available else fallback to brave_search
@@ -140,20 +143,24 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
-You are a helpful assistant.""",
+You are a helpful assistant."""
+ user_prompt = "You are in New York. Please get the current date and time, and the weather."
+
+ messages = [
+ {
+ "role": "system",
+ "content": system_prompt,
},
{
- "role":
- "user",
- "content":
- "You are in New York. Please get the current date and time, and the weather.",
+ "role": "user",
+ "content": user_prompt,
},
]
chat_completion = client.chat.completions.create(
model=model_name,
messages=messages,
- max_completion_tokens=100,
+ max_completion_tokens=256,
response_format={
"type":
"structural_tag",
@@ -173,11 +180,18 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
"triggers": ["([\S\s]+?)',
+ message.content)
+ params = json.loads(match.group(1))
+ jsonschema.validate(params,
+ tool_get_current_weather["function"]["parameters"])
+
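+    # Extract and validate the get_current_date call the same way.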
+    match = re.search(r'<function=get_current_date>([\S\s]+?)</function>',
+ message.content)
+ params = json.loads(match.group(1))
+ jsonschema.validate(params, tool_get_current_date["function"]["parameters"])