diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py
index e8fd2cef784..06e7edc2efb 100644
--- a/tensorrt_llm/commands/serve.py
+++ b/tensorrt_llm/commands/serve.py
@@ -203,12 +203,14 @@ def launch_mm_encoder_server(
     metadata_server_cfg: Optional[MetadataServerConfig] = None,
 ):
     model = encoder_args["model"]
+    encoder_args.pop("build_config")
     mm_encoder = MultimodalEncoder(**encoder_args)

     server = OpenAIServer(llm=mm_encoder,
                           model=model,
                           server_role=ServerRole.MM_ENCODER,
-                          metadata_server_cfg=metadata_server_cfg)
+                          metadata_server_cfg=metadata_server_cfg,
+                          tool_parser=None)

     asyncio.run(server(host, port))

diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
index 3bca2c6eced..e98cfec22f6 100644
--- a/tests/integration/test_lists/test-db/l0_a10.yml
+++ b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -49,6 +49,7 @@ l0_a10:
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[True-True-TinyLlama-1.1B-Chat-v1.0]
   - test_e2e.py::test_openai_chat_guided_decoding
   - test_e2e.py::test_openai_chat_multimodal_example ISOLATION
+  - test_e2e.py::test_openai_mmencoder_example
   - test_e2e.py::test_openai_perf_metrics
   - test_e2e.py::test_openai_prometheus
   - test_e2e.py::test_openai_lora
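
Note on the serve.py hunk: `encoder_args` appears to be assembled from CLI options shared with the LLM server path, so it carries a `build_config` entry that `MultimodalEncoder(**encoder_args)` presumably cannot accept; the patch pops it before unpacking. Likewise, `OpenAIServer` evidently takes a `tool_parser` argument now, which the encoder-only server passes as `None`. Below is a minimal, self-contained sketch of the kwarg-filtering pattern; the names `EncoderStub` and `launch_stub` are hypothetical stand-ins, not TensorRT-LLM APIs:

```python
# Hedged sketch of the defensive kwarg-filtering pattern used in the hunk
# above. EncoderStub / launch_stub are illustrative only.

class EncoderStub:
    def __init__(self, model: str):
        self.model = model


def launch_stub(encoder_args: dict) -> EncoderStub:
    model = encoder_args["model"]
    # Upstream plumbing may insert keys the constructor does not accept
    # (here, "build_config"); drop them before ** unpacking. The patch pops
    # unconditionally; pop("build_config", None) would also tolerate the
    # key being absent.
    encoder_args.pop("build_config", None)
    return EncoderStub(**encoder_args)


if __name__ == "__main__":
    enc = launch_stub({"model": "tiny-llava", "build_config": object()})
    print(enc.model)  # -> tiny-llava
```

One design note: the patch's unconditional `pop("build_config")` raises `KeyError` if the key is ever missing, which effectively asserts that the CLI always supplies it; the defaulted form in the sketch trades that assertion for tolerance.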