diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index c032c80757c..d5fa982a37a 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -2043,10 +2043,13 @@ void KVCacheManager::addSequence( void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest) { auto const requestId = llmRequest.mRequestId; - auto& sequence = getSequence(requestId); - if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest()) + if (mSequences.find(requestId) != mSequences.end()) { - mBlockManager.storeContextBlocks(sequence, llmRequest); + auto& sequence = getSequence(requestId); + if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest()) + { + mBlockManager.storeContextBlocks(sequence, llmRequest); + } } } diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt index aa5b3cf45da..af657a625e2 100755 --- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt +++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt @@ -52,6 +52,6 @@ if(NOT WIN32) ${TRTLLM_NB_MODULE} PROPERTIES LINK_FLAGS - "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" + "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" ) endif() diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt index b4809d5135e..bb1d87f9d4b 100755 --- a/cpp/tensorrt_llm/pybind/CMakeLists.txt +++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt @@ -53,6 +53,6 @@ if(NOT WIN32) ${TRTLLM_PYBIND_MODULE} PROPERTIES LINK_FLAGS - "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" + "-Wl,-rpath,'$ORIGIN/libs' 
-Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" ) endif() diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index c832481da9f..eeafc8f4a65 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -71,8 +71,9 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999" # Install OpenCV with FFMPEG support -RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/ -RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir +RUN pip3 uninstall -y opencv && \ + rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \ + pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir # WARs against security issues inherited from pytorch:25.06 # * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7 diff --git a/examples/models/core/kimi_k2/README.md b/examples/models/core/kimi_k2/README.md new file mode 100644 index 00000000000..1dd3e353c5a --- /dev/null +++ b/examples/models/core/kimi_k2/README.md @@ -0,0 +1,127 @@ +# K2 (Kimi-K2-Instruct) + +## Overview + +Kimi K2 is Moonshot AI's Mixture-of-Experts model with 32 billion activated parameters and 1 trillion total parameters. It achieves state-of-the-art performance in frontier knowledge, math, and coding among non-thinking models. Notably, K2 also excels in agentic capabilities, demonstrating outstanding performance across complex, multi-step tasks. + +## Prerequisites for Tool Calling in Kimi-K2 + +K2 model supports tool calling functionality. The official guide can be found at: [tool_call_guidance](https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md) + +As described in the official guide, a tool calling process in Kimi-K2 includes: +1. Passing function descriptions to Kimi-K2. +2. 
Kimi-K2 decides to make a function call and returns the necessary information for the function call to the user. +3. The user performs the function call, collects the call results, and passes the function call results to Kimi-K2. +4. Kimi-K2 continues to generate content based on the function call results until the model believes it has obtained sufficient information to respond to the user. + +Tools are the primary way to define callable functions for K2. Each tool requires: +- A unique name +- A clear description +- A JSON schema defining the expected parameters + +A possible example of a tool description (you may refer to [Using tools](https://huggingface.co/docs/hugs/guides/function-calling) for more information) is as follows: +```python +# Collect the tool descriptions in tools +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather information. Call this tool when the user needs to get weather information", + "parameters": { + "type": "object", + "required": ["location"], + "properties": { + "location": { + "type": "string", + "description": "location name", + } + } + } + } +}] +``` + +Kimi currently supports two main approaches for tool calling: +1. *Use openai.OpenAI to send messages to Kimi-K2 together with tool descriptions.* +In this mode, the descriptions of the tools are passed as an argument to `client.chat.completions.create`, and the tool-call details can be read directly from the corresponding fields in the response. +2. *Manually parse the tool-call requests from the outputs generated by Kimi-K2.* +The tool call requests generated by Kimi-K2 are wrapped by <|tool_calls_section_begin|> and <|tool_calls_section_end|>, with each tool call wrapped by <|tool_call_begin|> and <|tool_call_end|>. The tool ID and arguments are separated by <|tool_call_argument_begin|>. The format of the tool ID is functions.{func_name}:{idx}, from which we can parse the function name. 
+ +**Note that TensorRT-LLM does not support the first approach for now. If you deploy K2 with TensorRT-LLM, you need to manually parse the tool-call requests from the outputs.** + +The next section is an example that deploys the K2 model using TensorRT-LLM and then manually parses the tool-call results. + +## Example: Manually Parsing Tool-Call Requests from Kimi-K2 Outputs + +First, launch a server using trtllm-serve: + +```bash +cat > ./extra_llm_api_options.yaml < +<|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location": "shanghai"}<|tool_call_end|> +<|tool_calls_section_end|>user + +[The tool-call requests parsed from the output]: [{'id': 'functions.get_weather:0', 'type': 'function', 'function': {'name': 'get_weather', 'arguments': '{"location": "shanghai"}'}}] + +[Tool call result]: tool_name=get_weather, tool_result=Cloudy +``` + +The tool call works successfully: +- In `[The original output from Kimi-K2]`, the LLM selects the correct tool `get_weather` and provides the appropriate arguments. +- In `[The tool-call requests parsed from the output]`, the client parses the LLM response. +- In `[Tool call result]`, the client executes the tool function and gets the result. + +Let's try another query, "What's the weather like in beijing today?", using a predefined system prompt to specify the output format as shown below. + +```bash +python kimi_k2_tool_calling_example.py \ + --model "moonshotai/Kimi-K2-Instruct" \ + --prompt "What's the weather like in beijing today?" \ + --specify_output_format +``` + +The output would look like: + +```txt +[The original output from Kimi-K2]: [get_weather(location='beijing')]user + +[The tool-call requests parsed from the output]: [{'type': 'function', 'function': {'name': 'get_weather', 'arguments': {'location': 'beijing'}}}] + +[Tool call result]: tool_name=get_weather, tool_result=Sunny +``` +Once again, the tool call works successfully and the original output from Kimi-K2 is formatted. 
+ +**Note that, without guided decoding or other deterministic tool adapters, K2 sometimes deviates from the specified output format. Because TensorRT-LLM does not support K2 with guided decoding for now, you have to parse the tool calls carefully from the raw model output to ensure they meet the required format.** diff --git a/examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py b/examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py new file mode 100644 index 00000000000..28505477041 --- /dev/null +++ b/examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py @@ -0,0 +1,201 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import ast +import json +import re + +from openai import OpenAI + +SPECIFY_OUTPUT_FORMAT_PROMPT = """You are an AI assistant with the role name "assistant." \ +Based on the provided API specifications and conversation history from steps 1 to t, \ +generate the API requests that the assistant should call in step t+1. \ +The API requests should be output in the format [api_name(key1='value1', key2='value2', ...)], \ +replacing api_name with the actual API name, key1, key2, etc., with the actual parameter names, \ +and value1, value2, etc., with the actual parameter values. The output should start with a square bracket "[" and end with a square bracket "]". 
+If there are multiple API requests, separate them with commas, for example: \ +[api_name(key1='value1', key2='value2', ...), api_name(key1='value1', key2='value2', ...), ...]. \ +Do not include any other explanations, prompts, or API call results in the output. +If the API parameter description does not specify otherwise, the parameter is optional \ +(parameters mentioned in the user input need to be included in the output; if not mentioned, they do not need to be included). +If the API parameter description does not specify the required format for the value, use the user's original text for the parameter value. \ +If the API requires no parameters, output the API request directly in the format [api_name()], and do not invent any nonexistent parameter names. + +API Specifications: +{tools}""" + +NOT_SPECIFY_OUTPUT_FORMAT_PROMPT = """Important: Only give the tool call requests, \ +do not include any other explanations, prompts, or API call results in the output. +The tool call requests generated by you are wrapped by \ +<|tool_calls_section_begin|> and <|tool_calls_section_end|>, with each tool call wrapped by <|tool_call_begin|> and <|tool_call_end|>. \ +The tool ID and arguments are separated by <|tool_call_argument_begin|>. The format of the tool ID is functions.func_name:idx, \ +from which we can parse the function name. 
+ +API Specifications: +{tools}""" + + +def get_weather(location: str): + if location.lower() == "beijing": + return "Sunny" + elif location.lower() == "shanghai": + return "Cloudy" + else: + return "Rainy" + + +# Tool name->object mapping for easy calling later +tool_map = {"get_weather": get_weather} + + +# ref: https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md +def extract_tool_call_info(tool_call_rsp: str): + if '<|tool_calls_section_begin|>' not in tool_call_rsp: + # No tool calls + return [] + pattern = r"<\|tool_calls_section_begin\|>(.*?)<\|tool_calls_section_end\|>" + + tool_calls_sections = re.findall(pattern, tool_call_rsp, re.DOTALL) + + # Extract multiple tool calls + func_call_pattern = r"<\|tool_call_begin\|>\s*(?P[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P.*?)\s*<\|tool_call_end\|>" + tool_calls = [] + for match in re.findall(func_call_pattern, tool_calls_sections[0], + re.DOTALL): + function_id, function_args = match + # function_id: functions.get_weather:0 + function_name = function_id.split('.')[1].split(':')[0] + tool_calls.append({ + "id": function_id, + "type": "function", + "function": { + "name": function_name, + "arguments": function_args + } + }) + return tool_calls + + +def parse_specified_format_tool_calls(text: str): + pattern = re.compile(r'(\w+)\s*\(([^)]*)\)') + tool_calls = [] + + for m in pattern.finditer(text): + api_name, kv_body = m.group(1), m.group(2) + + kv_pattern = re.compile(r'(\w+)\s*=\s*([^,]+)') + kwargs = {} + for k, v in kv_pattern.findall(kv_body): + try: + kwargs[k] = ast.literal_eval(v.strip()) + except Exception: + kwargs[k] = v.strip() + + tool_calls.append({ + "type": "function", + "function": { + "name": api_name, + "arguments": kwargs + } + }) + + return tool_calls + + +def get_tools(): + # Collect the tool descriptions in tools + return [{ + "type": "function", + "function": { + "name": "get_weather", + "description": + "Get weather information. 
Call this tool when the user needs to get weather information", + "parameters": { + "type": "object", + "required": ["location"], + "properties": { + "location": { + "type": "string", + "description": "Location name", + } + } + } + } + }] + + +def get_tool_call_requests(args, client): + model = args.model + tools = get_tools() + system_prompt = SPECIFY_OUTPUT_FORMAT_PROMPT if args.specify_output_format else NOT_SPECIFY_OUTPUT_FORMAT_PROMPT.format( + tools=tools) + messages = [{ + "role": "system", + "content": system_prompt + }, { + "role": "user", + "content": args.prompt + }] + + response = client.chat.completions.create(model=model, + messages=messages, + max_tokens=256, + temperature=0.0) + + output = response.choices[0].message.content + tool_calls = parse_specified_format_tool_calls( + output) if args.specify_output_format else extract_tool_call_info( + output) + print(f"[The original output from Kimi-K2]: {output}\n") + print(f"[The tool-call requests parsed from the output]: {tool_calls}\n") + return tool_calls, messages + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", + type=str, + default="moonshotai/Kimi-K2-Instruct") + parser.add_argument("--prompt", + type=str, + default="What's the weather like in Shanghai today?") + parser.add_argument("--specify_output_format", + action="store_true", + default=False) + + args = parser.parse_args() + + # start trt-llm server before running this script + client = OpenAI( + api_key="tensorrt_llm", + base_url="http://localhost:8000/v1", + ) + + tool_calls, messages = get_tool_call_requests(args, client) + + for tool_call in tool_calls: + tool_name = tool_call['function']['name'] + if args.specify_output_format: + tool_arguments = tool_call['function']['arguments'] + else: + tool_arguments = json.loads(tool_call['function']['arguments']) + tool_function = tool_map[tool_name] + tool_result = tool_function(**tool_arguments) + print( + f"[Tool call result]: 
tool_name={tool_name}, tool_result={tool_result}\n" + ) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 95522b2bf26..d00dd66d534 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -591,6 +591,12 @@ def getMergeRequestChangedFileList(pipeline, globalVars) { } def getMergeRequestOneFileChanges(pipeline, globalVars, filePath) { + def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/) + if (env.alternativeTRT || isOfficialPostMergeJob) { + pipeline.echo("Force set changed file diff to empty string.") + return "" + } + def githubPrApiUrl = globalVars[GITHUB_PR_API_URL] def diff = "" diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 52abdbcb844..3041e684c96 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -27,7 +27,7 @@ from shutil import copy, copytree, rmtree from subprocess import DEVNULL, CalledProcessError, check_output, run from textwrap import dedent -from typing import List +from typing import Sequence try: from packaging.requirements import Requirement @@ -120,7 +120,8 @@ def create_venv(project_dir: Path): return venv_prefix -def setup_venv(project_dir: Path, requirements_file: Path, no_venv: bool): +def setup_venv(project_dir: Path, requirements_file: Path, + no_venv: bool) -> tuple[Path, Path]: """Creates/updates a venv and installs requirements. Args: @@ -279,6 +280,103 @@ def generate_fmha_cu(project_dir, venv_python): os.chdir(project_dir) +def create_cuda_stub_links(cuda_stub_dir: str): + """ + Creates symbolic links for CUDA stub libraries in the provided directory. + + Args: + cuda_stub_dir (str): Path to the directory containing CUDA stubs. + """ + cuda_stub_path = Path(cuda_stub_dir) + if not cuda_stub_path.exists(): + raise RuntimeError( + f"CUDA stub directory '{cuda_stub_dir}' does not exist.") + + shared_objects = ["cuda.so", + "nvidia-ml.so"] # List of shared object names to process. 
+ + for lib_name in shared_objects: + # Define the full paths for the library (.so) and its versioned link (.so.1). + so = cuda_stub_path / f"lib{lib_name}" # e.g., libcuda.so + so_versioned = cuda_stub_path / f"lib{lib_name}.1" # e.g., libcuda.so.1 + + # Check if the library exists and the versioned link does not. + if so.exists() and not so_versioned.exists(): + try: + # Attempt to create the symbolic link. + so_versioned.symlink_to(so) + except PermissionError: + # Handle permission errors by attempting to use `sudo` to create the link. + try: + build_run(f"sudo ln -s {str(so)} {str(so_versioned)}") + except CalledProcessError as sudo_error: + print( + f"Failed to create symbolic link even with sudo: {sudo_error}" + ) + + +def generate_python_stubs_linux(binding_type: str, venv_python: Path, + deep_ep: bool): + is_nanobind = binding_type == "nanobind" + package = "nanobind" if is_nanobind else "pybind11-stubgen" + build_run(f"\"{venv_python}\" -m pip install {package}") + + env_stub_gen = os.environ.copy() + cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get( + "CUDA_PATH") or "/usr/local/cuda" + cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs" + ld_library_path = env_stub_gen.get("LD_LIBRARY_PATH") + if Path(cuda_stub_dir).exists(): + # Create symbolic links for the CUDA stubs + create_cuda_stub_links(cuda_stub_dir) + env_stub_gen[ + "LD_LIBRARY_PATH"] = f"{ld_library_path}:{cuda_stub_dir}" if ld_library_path else cuda_stub_dir + if is_nanobind: + build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .", + env=env_stub_gen) + else: + build_run( + f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code", + env=env_stub_gen) + build_run( + f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code", + env=env_stub_gen) + if deep_ep: + build_run( + f"\"{venv_python}\" -m pybind11_stubgen -o . 
deep_ep_cpp_tllm --exit-code", + env=env_stub_gen) + + +def generate_python_stubs_windows(binding_type: str, venv_python: Path, + pkg_dir: Path, lib_dir: Path): + if binding_type == "nanobind": + print("Windows not yet supported for nanobind stubs") + exit(1) + else: + build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen") + stubgen = "stubgen.py" + stubgen_contents = """ + # Loading torch, trt before bindings is required to avoid import errors on windows. + # isort: off + import torch + import tensorrt as trt + # isort: on + import os + import platform + + from pybind11_stubgen import main + + if __name__ == "__main__": + # Load dlls from `libs` directory before launching bindings. + if platform.system() == "Windows": + os.add_dll_directory(r\"{lib_dir}\") + main() + """.format(lib_dir=lib_dir) + (pkg_dir / stubgen).write_text(dedent(stubgen_contents)) + build_run(f"\"{venv_python}\" {stubgen} -o . bindings") + (pkg_dir / stubgen).unlink() + + def main(*, build_type: str = "Release", generator: str = "", @@ -286,7 +384,7 @@ def main(*, dist_dir: Path = None, cuda_architectures: str = None, job_count: int = None, - extra_cmake_vars: List[str] = list(), + extra_cmake_vars: Sequence[str] = tuple(), extra_make_targets: str = "", trt_root: str = '/usr/local/tensorrt', nccl_root: str = None, @@ -361,7 +459,7 @@ def main(*, if on_windows: # Windows does not support multi-device currently. 
- extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"]) + extra_cmake_vars += ["ENABLE_MULTI_DEVICE=0"] # The Ninja CMake generator is used for our Windows build # (Easier than MSBuild to make compatible with our Docker image) @@ -703,81 +801,14 @@ def get_binding_lib(subdirectory, name): dirs_exist_ok=True) if not skip_stubs: - with working_directory(project_dir): - if binding_type == "nanobind": - build_run(f"\"{venv_python}\" -m pip install nanobind") - else: - build_run( - f"\"{venv_python}\" -m pip install pybind11-stubgen") with working_directory(pkg_dir): if on_windows: - if binding_type == "nanobind": - print("Windows not yet supported for nanobind stubs") - exit(1) - else: - stubgen = "stubgen.py" - stubgen_contents = """ - # Loading torch, trt before bindings is required to avoid import errors on windows. - # isort: off - import torch - import tensorrt as trt - # isort: on - import os - import platform - - from pybind11_stubgen import main - - if __name__ == "__main__": - # Load dlls from `libs` directory before launching bindings. - if platform.system() == "Windows": - os.add_dll_directory(r\"{lib_dir}\") - main() - """.format(lib_dir=lib_dir) - (pkg_dir / stubgen).write_text(dedent(stubgen_contents)) - build_run(f"\"{venv_python}\" {stubgen} -o . bindings") - (pkg_dir / stubgen).unlink() - else: - env_ld = os.environ.copy() - - new_library_path = "/usr/local/cuda/compat:/usr/local/cuda/compat/lib:/usr/local/cuda/compat/lib.real" - if 'LD_LIBRARY_PATH' in env_ld: - new_library_path += f":{env_ld['LD_LIBRARY_PATH']}" - - result = build_run("find /usr -name *libnvidia-ml.so*", - capture_output=True, - text=True) - assert result.returncode == 0, f"Failed to run find *libnvidia-ml.so*: {result.stderr}" - - # Build containers only contain stub version of libnvidia-ml.so and not the real version. - # If real version not in system, we need to create symbolic link to stub version to prevent import errors. 
- if "libnvidia-ml.so.1" not in result.stdout: - if "libnvidia-ml.so" in result.stdout: - line = result.stdout.splitlines()[0] - path = os.path.dirname(line) - new_library_path += f":{path}" - build_run(f"ln -s {line} {path}/libnvidia-ml.so.1") - else: - print( - f"Failed to find libnvidia-ml.so: {result.stderr}", - file=sys.stderr) - exit(1) - - env_ld["LD_LIBRARY_PATH"] = new_library_path - if binding_type == "nanobind": - build_run( - f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .", - env=env_ld) - else: - build_run( - f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code", - env=env_ld) - if deep_ep_cuda_architectures: - build_run( - f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code", - env=env_ld) - build_run( - f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code", - env=env_ld) + generate_python_stubs_windows(binding_type, venv_python, + pkg_dir, lib_dir) + else: # on linux + generate_python_stubs_linux( + binding_type, venv_python, + bool(deep_ep_cuda_architectures)) if not skip_building_wheel: if dist_dir is None: diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index b08c106e7e1..89be7d40e35 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -492,6 +492,10 @@ def update_resources(self, scheduled_batch: ScheduledRequests): if request.py_rewind_len > 0: self.rewind_kv_cache(request, request.py_rewind_len) + # For context requests, we store the blocks for reuse. 
+ for request in scheduled_batch.context_requests: + self.impl.store_context_blocks(request) + def free_resources(self, request: LlmRequest): self.impl.remove_sequence(request.py_request_id, request) diff --git a/tensorrt_llm/serve/scripts/benchmark_serving.py b/tensorrt_llm/serve/scripts/benchmark_serving.py index 1aeb87554d9..303688f0017 100644 --- a/tensorrt_llm/serve/scripts/benchmark_serving.py +++ b/tensorrt_llm/serve/scripts/benchmark_serving.py @@ -581,7 +581,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={k: [results[k]] - for k in metrics}, + for k in metrics if k in results}, extra_info={ k: results[k] for k in results if k not in metrics and k not in ignored_metrics diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index e0801302eba..98432a3aab8 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -565,6 +565,40 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite" MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16" + def test_nixl_backend(self): + ctx_server_config = { + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": "nixl" + } + } + gen_server_config = { + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": "nixl" + } + } + disaggregated_server_config = { + "hostname": "localhost", + "port": 8000, + "backend": "pytorch", + "context_servers": { + "num_instances": 1, + "urls": ["localhost:8001"] + }, + "generation_servers": { + "num_instances": 1, + "urls": ["localhost:8002"] + } + } + with launch_disaggregated_llm(disaggregated_server_config, + ctx_server_config, gen_server_config, + self.MODEL_PATH) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = 
GSM8K(self.MODEL_NAME) + task.evaluate(llm) + @parametrize_with_ids("overlap_scheduler", [True, False]) @parametrize_with_ids("mtp_nextn", [0, pytest.param(2, marks=skip_pre_hopper)]) @@ -666,6 +700,40 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness): MODEL_NAME = "Qwen3/Qwen3-8B" MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8" + def test_nixl_backend(self): + ctx_server_config = { + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": "nixl" + } + } + gen_server_config = { + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": "nixl" + } + } + disaggregated_server_config = { + "hostname": "localhost", + "port": 8000, + "backend": "pytorch", + "context_servers": { + "num_instances": 1, + "urls": ["localhost:8001"] + }, + "generation_servers": { + "num_instances": 1, + "urls": ["localhost:8002"] + } + } + with launch_disaggregated_llm(disaggregated_server_config, + ctx_server_config, gen_server_config, + self.MODEL_PATH) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + @pytest.mark.parametrize("overlap_scheduler", [False, True]) def test_auto_dtype(self, overlap_scheduler): ctx_server_config = { diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index 2a961553905..c193a358197 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -14,11 +14,14 @@ # limitations under the License. 
import os +import re import subprocess +import tempfile import pytest -from defs.conftest import skip_arm, skip_no_hopper -from defs.trt_test_alternative import check_call, popen +import yaml +from defs.conftest import llm_models_root, skip_arm, skip_no_hopper +from defs.trt_test_alternative import check_call, check_output, popen from tensorrt_llm.logger import logger @@ -1051,3 +1054,227 @@ def test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp( "deepseek_v3_lite_fp8_tp1_two_mtp", env=llm_venv._new_env, cwd=llm_venv.get_working_directory()) + + +@pytest.fixture(scope="module") +def benchmark_root(): + llm_root = os.getenv("LLM_ROOT") + return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts") + + +@pytest.fixture(scope="module") +def shared_gpt_path(): + DEFAULT_LLM_MODEL_ROOT = os.path.join("/scratch.trt_llm_data", "llm-models") + LLM_MODELS_ROOT = os.environ.get("LLM_MODELS_ROOT", DEFAULT_LLM_MODEL_ROOT) + return os.path.join(LLM_MODELS_ROOT, "datasets", + "ShareGPT_V3_unfiltered_cleaned_split.json") + + +@pytest.fixture(scope="function") +def benchmark_model_root(request): + models_root = llm_models_root() + if (request.param == "DeepSeek-V3-Lite-fp8"): + model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "fp8") + elif (request.param == "DeepSeek-V3-Lite-bf16"): + model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "bf16") + elif request.param == "llama-v3-8b-hf": + model_path = os.path.join(models_root, "llama-models-v3", "8B") + elif request.param == "llama-3.1-8b-instruct-hf-fp8": + model_path = os.path.join(models_root, "llama-3.1-model", + "Llama-3.1-8B-Instruct-FP8") + else: + raise ValueError(f"Failed to find the model: {request.param}") + return model_path + + +def run_disaggregated_benchmark(example_dir, + config_file, + benchmark_root, + benchmark_model_root, + shared_gpt_path, + env=None, + cwd=None): + """Run disaggregated test with given configuration.""" + run_env = env.copy() + run_env["UCX_TLS"] = "^ib" + num_rank = 2 + 
workers_cmd = [ + 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', + str(num_rank), 'trtllm-serve', 'disaggregated_mpi_worker', '-c', + config_file + ] + + server_start_timeout = 900 + server_cmd = [ + 'trtllm-serve', 'disaggregated', '--server_start_timeout', + str(server_start_timeout), '-c', config_file + ] + try: + with ( # Start workers + open('output_workers.log', 'w') as output_workers, + popen(workers_cmd, + stdout=output_workers, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as workers_proc, + # Start server + open('output_disagg.log', 'w') as output_disagg, + popen(server_cmd, + stdout=output_disagg, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as server_proc): + # Ensure the sever has started + client_dir = f"{example_dir}/clients" + client_cmd = [ + 'python3', f'{client_dir}/disagg_client.py', '-c', + f'{example_dir}/disagg_config.yaml', '-p', + f'{client_dir}/prompts.json', '--ignore-eos', + '--server-start-timeout', + str(server_start_timeout) + ] + # Warm up + check_call(client_cmd, + env=env, + poll_procs=[workers_proc, server_proc]) + # Start Benchmark + benchmark_script = os.path.join(benchmark_root, + "benchmark_serving.py") + benchmark_cmd = [ + 'python3', + benchmark_script, + '--model', + benchmark_model_root, + '--tokenizer', + benchmark_model_root, + '--dataset-name', + 'random', + '--dataset-path', + shared_gpt_path, + '--random-input-len', + '256', + '--random-output-len', + '64', + '--random-prefix-len', + '0', + '--num-prompts', + '320', + '--max-concurrency', + '32', + '--host', + 'localhost', + '--port', + '8000', + '--ignore-eos', + '--no-test-input', + '--percentile-metrics', + 'e2el,ttft', + ] + # warm up + check_call(benchmark_cmd, env=env) + output = check_output(benchmark_cmd, env=env) + e2el_pattern = r"Median E2EL \(ms\):\s*(\d+\.?\d*)" + ttft_pattern = r"Median TTFT \(ms\):\s*(\d+\.?\d*)" + e2el_match = re.search(e2el_pattern, output) + ttft_match = re.search(ttft_pattern, output) + if e2el_match and 
ttft_match: + median_e2el = float(e2el_match.group(1)) + median_ttft = float(ttft_match.group(1)) + return median_e2el, median_ttft + else: + raise ValueError("No benchmark result found") + + except Exception: + # Print outputs on error + logger.error("-------- Workers output --------") + with open('output_workers.log', 'r') as f: + logger.error(f.read()) + + logger.error("-------- Disagg server output --------") + with open('output_disagg.log', 'r') as f: + logger.error(f.read()) + raise + finally: + server_proc.terminate() + workers_proc.terminate() + server_proc.wait() + workers_proc.wait() + + +def get_config_for_benchmark(model_root, backend): + serve_config = { + "model": model_root, + "hostname": "localhost", + "port": 8000, + "backend": "pytorch", + "context_servers": { + "num_instances": 1, + "max_batch_size": 2, + "max_num_tokens": 384, + "max_seq_len": 384, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": backend, + "max_tokens_in_buffer": 512, + }, + "urls": ["localhost:8001"] + }, + "generation_servers": { + "num_instances": 1, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "max_batch_size": 2, + "max_num_tokens": 384, + "max_seq_len": 384, + "cache_transceiver_config": { + "backend": backend, + "max_tokens_in_buffer": 512, + }, + "urls": ["localhost:8002"] + } + } + return serve_config + + +@pytest.mark.parametrize("benchmark_model_root", [ + 'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf', + 'llama-3.1-8b-instruct-hf-fp8' +], + indirect=True) +def test_disaggregated_benchmark_on_diff_backends( + disaggregated_test_root, disaggregated_example_root, llm_venv, + benchmark_model_root, benchmark_root, shared_gpt_path): + nixl_config = get_config_for_benchmark(benchmark_model_root, "nixl") + ucx_config = get_config_for_benchmark(benchmark_model_root, "ucx") + temp_dir = tempfile.TemporaryDirectory() + nixl_config_path = 
os.path.join(temp_dir.name, "nixl_config.yaml") + ucx_config_path = os.path.join(temp_dir.name, "ucx_config.yaml") + with open(nixl_config_path, 'w', encoding='utf-8') as f: + yaml.dump(nixl_config, f) + with open(ucx_config_path, 'w', encoding='utf-8') as f: + yaml.dump(ucx_config, f) + + env = llm_venv._new_env.copy() + nixl_e2el, nixl_ttft = run_disaggregated_benchmark( + disaggregated_example_root, + nixl_config_path, + benchmark_root, + benchmark_model_root, + shared_gpt_path, + env=env, + cwd=llm_venv.get_working_directory()) + ucx_e2el, ucx_ttft = run_disaggregated_benchmark( + disaggregated_example_root, + ucx_config_path, + benchmark_root, + benchmark_model_root, + shared_gpt_path, + env=env, + cwd=llm_venv.get_working_directory()) + print(f"Nixl E2EL: {nixl_e2el} ms, UCX E2EL: {ucx_e2el} ms") + print(f"Nixl TTFT: {nixl_ttft} ms, UCX TTFT: {ucx_ttft} ms") + + assert ucx_e2el > 0 and nixl_e2el > 0 and nixl_e2el < 1.05 * ucx_e2el + assert ucx_ttft > 0 and nixl_ttft > 0 and nixl_ttft < 1.05 * ucx_ttft diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index 037bb0c3d29..4a1f44dbb29 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -573,6 +573,8 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype +accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-] test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-] diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt 
b/tests/integration/test_lists/qa/llm_function_sanity.txt index ae7008f815d..42ec8d21a91 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -110,6 +110,8 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] +accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml index ce8058136fa..5ec16996e7c 100644 --- a/tests/integration/test_lists/test-db/l0_a30.yml +++ b/tests/integration/test_lists/test-db/l0_a30.yml @@ -18,8 +18,7 @@ l0_a30: - unittest/_torch/modeling -k "modeling_phi3" - unittest/_torch/modeling -k "modeling_qwen" - unittest/_torch/modeling -k "modeling_qwen_moe" - - unittest/_torch/modeling -k "modeling_exaone4" - - unittest/_torch/auto_deploy/unit/singlegpu + - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - unittest/_torch/test_beam_search.py - condition: ranges: diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 730cd016743..26b4b2a0a88 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -70,7 +70,7 @@ l0_b200: - 
unittest/_torch/modeling -k "modeling_mixtral" - unittest/_torch/modeling -k "modeling_deepseek" - unittest/_torch/modeling -k "modeling_gpt_oss" - - unittest/_torch/auto_deploy/unit/singlegpu + - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - unittest/_torch/speculative/test_eagle3.py - unittest/_torch/speculative/test_kv_cache_reuse.py - unittest/_torch/speculative/test_dynamic_spec_decode.py diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 29b83ac0778..ca23535a199 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -79,3 +79,10 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRTLLM] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4] + - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] + - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend + - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 29d8efac07f..798353ddc02 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -52,6 +52,8 @@ l0_dgx_h100: - 
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] + - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend + - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend - test_e2e.py::test_ptp_quickstart_advanced_bs1 - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8] - unittest/_torch/modeling/test_modeling_pixtral.py::test_tensor_parallelism @@ -110,6 +112,10 @@ l0_dgx_h100: - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml 
b/tests/integration/test_lists/test-db/l0_h100.yml index 43ee39de1af..1a8fded524b 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -15,6 +15,7 @@ l0_h100: tests: # ------------- PyTorch tests --------------- # Only key models in H100: llama/mixtral/nemotron/deepseek + - unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py::test_trtllm_bench_backend_comparison - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" TIMEOUT (90) - unittest/_torch -k "modeling_llama" - unittest/_torch/modeling -k "modeling_mixtral" diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 71643434923..026eeeca5c4 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -256,7 +256,6 @@ unittest/trt/attention/test_gpt_attention.py -k "partition3" SKIP (https://nvbug test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False] SKIP (https://nvbugs/5414909) unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout] SKIP (https://nvbugs/5418673) unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep4-enable_adp-enable_graph-tp8-trtllm-scout] SKIP (https://nvbugs/5418673) -examples/test_llama.py::test_llm_api_lookahead_decoding_1gpu[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] SKIP (https://nvbugs/5419066) examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5141288) examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5419067) examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_vl_7b_instruct-nb:4] SKIP (https://nvbugs/5419068) @@ -294,3 +293,4 @@ 
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5445466) disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[llama-3.1-8b] SKIP (https://nvbugs/5445642) examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (https://nvbugs/5447530) +examples/test_nemotron_nas.py::test_nemotron_nas_summary_2gpu[DeciLM-7B] SKIP (https://nvbugs/5444636) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py index 2985e662b27..f5ec68e28d9 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py @@ -1,14 +1,231 @@ +import json +import re import subprocess import tempfile from pathlib import Path +import pytest import yaml from _model_test_utils import _hf_model_dir_or_hub_id -from click.testing import CliRunner from utils.cpp_paths import llm_root # noqa: F401 from utils.llm_data import llm_models_root -from tensorrt_llm.commands.bench import main + +def parse_kv_cache_metrics(log_output: str, free_mem_ratio: float = 0.8): + """Parse KV cache metrics from the benchmark log output.""" + metrics = {} + + # Simple patterns based on actual log format + patterns = { + "current_cache_size": r"Current cache size:\s*(\d+)", + "free_mem_pre_mb": r"Free memory before forward pass \(MB\):\s*(\d+)", + "free_mem_post_mb": r"Free memory after forward pass \(MB\):\s*(\d+)", + } + + # Extract metrics using simple regex patterns + for metric_name, pattern in patterns.items(): + match = re.search(pattern, log_output, re.IGNORECASE) + if match: + value = int(match.group(1)) + metrics[metric_name] = value + print(f" ✅ Found {metric_name}: {value}") + else: + print(f" ❌ Could not find {metric_name}") + 
+ # Calculate new_cache_size using the same formula as in resize_kv_cache + # new_cache_size = free_mem_post * 1024 * 1024 * free_mem_ratio + current_cache_size + if "free_mem_post_mb" in metrics and "current_cache_size" in metrics: + metrics["new_cache_size"] = int( + metrics["free_mem_post_mb"] * 1024 * 1024 * free_mem_ratio + + metrics["current_cache_size"] + ) + print( + f" ✅ Calculated new_cache_size: {metrics['new_cache_size']} (using free_mem_ratio={free_mem_ratio})" + ) + else: + print(" ❌ Cannot calculate new_cache_size - missing required metrics") + + return metrics + + +def run_benchmark( + model_name: str, + dataset_path: str, + temp_dir: str, + backend: str = "_autodeploy", + report_json_path: str = None, + max_batch_size: int = 32, + num_hidden_layers: int = 2, + free_mem_ratio: float = 0.1, +): + """Run benchmark and capture KV cache metrics from log output.""" + + # Read the test config to get free_mem_ratio + config_path = f"{temp_dir}/extra_llm_api_options.yaml" + + # Build the command to run the benchmark + cmd = [ + "python", + "-m", + "tensorrt_llm.commands.bench", + "--model", + model_name, + "throughput", + "--backend", + backend, + "--dataset", + str(dataset_path), + "--max_batch_size", + str(max_batch_size), + ] + + # Add report_json argument if path is provided + if report_json_path: + cmd.extend(["--report_json", report_json_path]) + + if backend == "_autodeploy": + # Add extra_llm_api_options only for autodeploy backend + cmd.extend(["--extra_llm_api_options", config_path]) + + # Run benchmark as subprocess to capture ALL output + import os + + env = os.environ.copy() + if backend == "pytorch": + env["TLLM_OVERRIDE_LAYER_NUM"] = str(num_hidden_layers) + print(f"📋 Using TLLM_OVERRIDE_LAYER_NUM from env: {env['TLLM_OVERRIDE_LAYER_NUM']}") + cmd.extend(["--kv_cache_free_gpu_mem_fraction", str(free_mem_ratio)]) + print(f"🚀 Running benchmark command ({backend} backend): {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, 
text=True, env=env, timeout=600) + + # Check if the command succeeded + assert result.returncode == 0, ( + f"Benchmark failed with return code {result.returncode}:\n" + f"STDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}" + ) + + # Combine stdout and stderr for parsing + full_log_output = f"{result.stdout}\n{result.stderr}" + + # Parse KV cache metrics from the combined log output (only for autodeploy backend) + kv_cache_metrics = {} + if backend == "_autodeploy": + kv_cache_metrics = parse_kv_cache_metrics(full_log_output, free_mem_ratio) + print("📊 KV Cache Metrics parsed from logs:") + if kv_cache_metrics: + for key, value in kv_cache_metrics.items(): + if "mb" in key.lower(): + print(f" {key}: {value}MB") + else: + print(f" {key}: {value} bytes") + else: + print(" âš ī¸ No KV cache metrics were parsed successfully") + else: + print(f"📊 KV Cache Metrics: Skipped for {backend} backend") + + # Return parsed JSON report with KV cache metrics if requested + if report_json_path and Path(report_json_path).exists(): + with open(report_json_path, "r") as f: + report_data = json.load(f) + + # Add KV cache metrics to the report (only for autodeploy backend) + if backend == "_autodeploy": + report_data["kv_cache_metrics"] = kv_cache_metrics + report_data["backend"] = backend + return report_data + return None + + +def compare_backends_performance( + autodeploy_tokens_per_sec: float, + pytorch_tokens_per_sec: float, + relative_tolerance: float = 0.20, + absolute_tolerance: float = 10.0, +): + """ + Compare performance between autodeploy and pytorch backends. + Fails if autodeploy is significantly worse than pytorch. 
+ + Args: + autodeploy_tokens_per_sec: Performance of autodeploy backend + pytorch_tokens_per_sec: Performance of pytorch backend + relative_tolerance: Relative tolerance (20% by default for backend comparison) + absolute_tolerance: Absolute tolerance (10 tokens/sec by default) + """ + # Calculate performance difference + performance_diff = pytorch_tokens_per_sec - autodeploy_tokens_per_sec + relative_diff = performance_diff / pytorch_tokens_per_sec if pytorch_tokens_per_sec > 0 else 0 + + print("=== BACKEND PERFORMANCE COMPARISON ===") + print(f"PyTorch backend: {pytorch_tokens_per_sec:.2f} tokens/sec/user") + print(f"Autodeploy backend: {autodeploy_tokens_per_sec:.2f} tokens/sec/user") + print(f"Performance difference: {performance_diff:.2f} tokens/sec ({relative_diff:.2%})") + + # If autodeploy is better than or equal to pytorch, always pass + if autodeploy_tokens_per_sec >= pytorch_tokens_per_sec: + print("✅ Autodeploy backend matches or exceeds PyTorch backend performance") + return + + # Autodeploy is slower - check if it's within acceptable tolerance + within_relative_tolerance = relative_diff <= relative_tolerance + within_absolute_tolerance = performance_diff <= absolute_tolerance + + if within_relative_tolerance or within_absolute_tolerance: + print("✅ Autodeploy backend performance within acceptable tolerance") + print( + f" Tolerance: {relative_tolerance:.2%} relative OR {absolute_tolerance:.2f} tokens/sec absolute" + ) + else: + assert False, ( + f"Autodeploy backend significantly underperforms compared to PyTorch! 
" + f"Autodeploy: {autodeploy_tokens_per_sec:.2f} tokens/sec/user, " + f"PyTorch: {pytorch_tokens_per_sec:.2f} tokens/sec/user, " + f"Performance gap: {performance_diff:.2f} tokens/sec ({relative_diff:.2%}), " + f"Tolerance: {relative_tolerance:.2%} relative OR {absolute_tolerance:.2f} tokens/sec absolute" + ) + + +def assert_performance_within_tolerance( + actual_tokens_per_sec: float, + golden_tokens_per_sec: float, + relative_tolerance: float = 0.15, + absolute_tolerance: float = 10.0, +): + """ + Assert that actual performance is within tolerance of golden result. + Only fails if performance is WORSE than golden - improvements always pass. + + Args: + actual_tokens_per_sec: Measured performance metric + golden_tokens_per_sec: Expected performance metric + relative_tolerance: Relative tolerance (15% by default) + absolute_tolerance: Absolute tolerance (10 tokens/sec by default) + """ + # If actual performance is better than or equal to golden, always pass + if actual_tokens_per_sec >= golden_tokens_per_sec: + print( + f"✅ Performance improvement detected:" + f" {actual_tokens_per_sec:.2f} >= {golden_tokens_per_sec:.2f} tokens/sec/user" + ) + return + + # Performance is worse than golden - check if it's within acceptable tolerance + performance_drop = golden_tokens_per_sec - actual_tokens_per_sec + relative_drop = ( + performance_drop / golden_tokens_per_sec if golden_tokens_per_sec > 0 else float("inf") + ) + + # Performance should be within relative tolerance OR absolute tolerance + within_relative_tolerance = relative_drop <= relative_tolerance + within_absolute_tolerance = performance_drop <= absolute_tolerance + + assert within_relative_tolerance or within_absolute_tolerance, ( + f"Performance regression detected! 
" + f"Actual: {actual_tokens_per_sec:.2f} tokens/sec/user, " + f"Golden: {golden_tokens_per_sec:.2f} tokens/sec/user, " + f"Performance drop: {performance_drop:.2f} tokens/sec ({relative_drop:.2%}), " + f"Tolerance: {relative_tolerance:.2%} relative OR {absolute_tolerance:.2f} tokens/sec absolute" + ) def prepare_dataset(root_dir: str, temp_dir: str, model_name: str): @@ -17,7 +234,7 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_name: str): dataset_tool = Path(root_dir, "benchmarks", "cpp", "prepare_dataset.py") script_dir = Path(root_dir, "benchmarks", "cpp") - # Generate a small dataset to run a test. + # Generate a small dataset to run a test - matching workload configuration command = [ "python3", f"{dataset_tool}", @@ -37,7 +254,9 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_name: str): "10", ] print(f"Running command: {' '.join(command)}") - result = subprocess.run(command, cwd=str(script_dir), capture_output=True, text=True) + result = subprocess.run( + command, cwd=str(script_dir), capture_output=True, text=True, timeout=300 + ) if result.returncode != 0: raise RuntimeError(f"Failed to prepare dataset: {result.stderr}") # Grab the stdout and write it to a dataset file for passing to suite. 
@@ -46,22 +265,324 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_name: str): return dataset_path -def run_benchmark(model_name: str, dataset_path: str, temp_dir: str): - runner = CliRunner() +def calculate_expected_kv_cache_metrics(free_mem_ratio: float): + """Calculate expected KV cache metrics based on actual GPU memory.""" + try: + import torch - args = [ - "--model", - model_name, - "throughput", - "--backend", - "_autodeploy", - "--dataset", - dataset_path, - "--extra_llm_api_options", - f"{temp_dir}/model_kwargs.yaml", + if torch.cuda.is_available(): + # Get total GPU memory in MB + _, total_mem_bytes = torch.cuda.mem_get_info(0) + total_mem_mb = total_mem_bytes // (1024 * 1024) + + # Estimate expected values based on model size + # For TinyLlama-1.1B, model should be 2.2GB + estimated_model_size_mb = 2200 # Conservative estimate + # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/6335 check why there is extra consumption + extra_consumption_mb = 2500 + expected_free_mem_range = ( + total_mem_mb - estimated_model_size_mb - extra_consumption_mb, + total_mem_mb - estimated_model_size_mb, + ) + + # Current cache size is typically small initially (16MB range) + expected_current_cache_size = 16777216 + + # Free memory values should be in reasonable range + expected_free_mem_pre_range = expected_free_mem_range + expected_free_mem_post_range = ( + expected_free_mem_range[0] - 1000, + expected_free_mem_range[1] - 500, + ) + + print("📊 GPU Memory Analysis:") + print(f" Total GPU memory: {total_mem_mb}MB") + print( + f" Expected free memory range: {expected_free_mem_range[0]}-{expected_free_mem_range[1]}MB" + ) + + return { + "total_mem_mb": total_mem_mb, + "expected_current_cache_size": expected_current_cache_size, + "expected_free_mem_pre_range": expected_free_mem_pre_range, + "expected_free_mem_post_range": expected_free_mem_post_range, + "free_mem_ratio": free_mem_ratio, + } + else: + return None + except ImportError: + return None + + +def 
validate_kv_cache_metrics_dynamic(kv_cache_metrics: dict, expected_metrics: dict): + """Validate KV cache metrics using dynamic expected values.""" + + # Validate current_cache_size (should be relatively stable) + current_cache_size = kv_cache_metrics.get("current_cache_size") + expected_cache_size = expected_metrics["expected_current_cache_size"] + if current_cache_size: + cache_diff = abs(current_cache_size - expected_cache_size) / expected_cache_size + assert cache_diff <= 0.5, ( # 50% tolerance for cache size + f"Current cache size outside expected range: {current_cache_size} vs expected ~{expected_cache_size}" + ) + print(f" ✅ current_cache_size: {current_cache_size} bytes (within range)") + + # Validate free memory values are in reasonable ranges + free_mem_pre = kv_cache_metrics.get("free_mem_pre_mb") + free_mem_post = kv_cache_metrics.get("free_mem_post_mb") + + if free_mem_pre: + pre_range = expected_metrics["expected_free_mem_pre_range"] + assert pre_range[0] <= free_mem_pre <= pre_range[1], ( + f"Free memory before forward pass outside expected range: " + f"{free_mem_pre}MB not in range {pre_range[0]}-{pre_range[1]}MB" + ) + print(f" ✅ free_mem_pre_mb: {free_mem_pre}MB (within range)") + + if free_mem_post: + post_range = expected_metrics["expected_free_mem_post_range"] + assert post_range[0] <= free_mem_post <= post_range[1], ( + f"Free memory after forward pass outside expected range: " + f"{free_mem_post}MB not in range {post_range[0]}-{post_range[1]}MB" + ) + print(f" ✅ free_mem_post_mb: {free_mem_post}MB (within range)") + + # Validate memory consumption (pre should be > post) + if free_mem_pre and free_mem_post: + memory_consumed = free_mem_pre - free_mem_post + assert memory_consumed > 0, ( + f"Expected memory consumption during forward pass, got {memory_consumed}MB" + ) + assert memory_consumed < 5000, f"Memory consumption too high: {memory_consumed}MB" + print(f" ✅ Memory consumed during forward pass: {memory_consumed}MB (reasonable)") + + # 
Validate calculated new_cache_size + new_cache_size = kv_cache_metrics.get("new_cache_size") + if new_cache_size and free_mem_post and current_cache_size: + expected_new_cache = int( + free_mem_post * 1024 * 1024 * expected_metrics["free_mem_ratio"] + current_cache_size + ) + cache_size_diff = abs(new_cache_size - expected_new_cache) / expected_new_cache + assert cache_size_diff <= 0.01, ( # 1% tolerance for calculated value + f"Calculated new_cache_size mismatch: {new_cache_size} vs expected {expected_new_cache}" + ) + print(f" ✅ new_cache_size: {new_cache_size} bytes (calculation correct)") + + +def extract_performance_metric(report_data, report_name="benchmark"): + """Extract performance metric from a benchmark report with validation.""" + assert report_data is not None, f"Failed to capture {report_name} report" + assert "performance" in report_data, f"Performance metrics not found in {report_name} report" + + tokens_per_sec = report_data["performance"].get("output_throughput_per_user_tok_s") + assert tokens_per_sec is not None, ( + f"output_throughput_per_user_tok_s not found in {report_name} performance metrics" + ) + + return tokens_per_sec + + +def validate_and_extract_kv_cache_metrics(report_data, free_mem_ratio, require_metrics=True): + """ + Validate and extract KV cache metrics from report. + + Args: + report_data: The benchmark report data + free_mem_ratio: Free memory ratio for calculating expected metrics + require_metrics: If True, fail when metrics are missing. If False, just warn. + + Returns: + Tuple of (kv_cache_metrics, expected_metrics) or (None, None) if validation fails + """ + required_metrics = [ + "current_cache_size", + "free_mem_pre_mb", + "free_mem_post_mb", + "new_cache_size", ] - result = runner.invoke(main, args, catch_exceptions=False) - assert result.exit_code == 0 + + # Extract KV cache metrics + kv_cache_metrics = report_data.get("kv_cache_metrics", {}) + + if not kv_cache_metrics: + message = ( + "KV cache metrics not found! 
" + "The autodeploy backend must log memory statistics for this test to pass. " + f"Expected metrics: {', '.join(required_metrics)}" + ) + if require_metrics: + assert False, f"REQUIRED {message}" + else: + print(f"â„šī¸ {message}") + assert False, "KV cache metrics are missing" + + # Check for missing metrics + missing_metrics = [metric for metric in required_metrics if metric not in kv_cache_metrics] + + if missing_metrics: + message = ( + f"Missing required KV cache metrics: {missing_metrics}. " + f"Found metrics: {list(kv_cache_metrics.keys())}. " + f"All of {required_metrics} are required for the test to pass." + ) + if require_metrics: + assert False, message + else: + print(f"â„šī¸ KV cache validation skipped - {message}") + assert False, "KV cache metrics are missing" + + # Calculate expected metrics + expected_metrics = calculate_expected_kv_cache_metrics(free_mem_ratio) + assert expected_metrics, "Could not determine expected metrics for this GPU" + + return kv_cache_metrics, expected_metrics + + +def print_kv_cache_metrics(kv_cache_metrics): + """Print KV cache metrics in a formatted way.""" + print("=== KV CACHE METRICS (DYNAMIC VALIDATION) ===") + for metric_name, actual_value in kv_cache_metrics.items(): + if "mb" in metric_name.lower(): + print(f"{metric_name}: {actual_value}MB") + else: + print(f"{metric_name}: {actual_value} bytes") + + +def trtllm_bench_unified_comparison( + llm_root, # noqa: F811 + comparison_mode="backend", + free_mem_ratio=0.1, + num_hidden_layers=2, + max_batch_size=32, # below this value the kv cache resizing is skipped + golden_tokens_per_sec=1400, + backend_relative_tolerance=0.2, + backend_absolute_tolerance=250.0, + golden_relative_tolerance=0.1, + golden_absolute_tolerance=5.0, +): + """ + Unified test that compares autodeploy backend performance in two modes: + - "backend": compares against pytorch backend performance + - "golden": compares against predefined golden performance values + + Args: + llm_root: Root 
directory for LLM models (pytest fixture) + comparison_mode: Either "backend" or "golden" to determine comparison type + free_mem_ratio: Ratio of free memory to use for KV cache + num_hidden_layers: Number of hidden layers for the model + max_batch_size: Maximum batch size for benchmarking + golden_tokens_per_sec: Golden performance value in tokens/sec/user + backend_relative_tolerance: Relative tolerance for backend comparison + backend_absolute_tolerance: Absolute tolerance for backend comparison + golden_relative_tolerance: Relative tolerance for golden comparison + golden_absolute_tolerance: Absolute tolerance for golden comparison + """ + model_name = _hf_model_dir_or_hub_id( + f"{llm_models_root()}/TinyLlama-1.1B-Chat-v1.0", "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + ) + + with tempfile.TemporaryDirectory() as temp_dir: + with open(f"{temp_dir}/extra_llm_api_options.yaml", "w") as f: + yaml.dump( + { + "model_kwargs": {"num_hidden_layers": num_hidden_layers}, + "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32], + "compile_backend": "torch-opt", + "free_mem_ratio": free_mem_ratio, + "runtime": "trtllm", + }, + f, + ) + + dataset_path = prepare_dataset(llm_root, temp_dir, model_name) + + # Always run autodeploy backend + autodeploy_report_path = f"{temp_dir}/autodeploy_report.json" + print("=== RUNNING AUTODEPLOY BACKEND ===") + autodeploy_report = run_benchmark( + model_name, + dataset_path, + temp_dir, + "_autodeploy", + autodeploy_report_path, + max_batch_size, + num_hidden_layers, + free_mem_ratio, + ) + + # Extract autodeploy performance metrics + autodeploy_tokens_per_sec = extract_performance_metric(autodeploy_report, "autodeploy") + + # Validate and extract KV cache metrics (now required for both modes after user's changes) + kv_cache_metrics, expected_metrics = validate_and_extract_kv_cache_metrics( + autodeploy_report, free_mem_ratio, require_metrics=True + ) + + if comparison_mode == "backend": + # Backend comparison mode: also run pytorch backend + 
pytorch_report_path = f"{temp_dir}/pytorch_report.json" + print("=== RUNNING PYTORCH BACKEND ===") + pytorch_report = run_benchmark( + model_name, + dataset_path, + temp_dir, + "pytorch", + pytorch_report_path, + max_batch_size, + num_hidden_layers, + free_mem_ratio, + ) + + # Extract pytorch performance metrics + pytorch_tokens_per_sec = extract_performance_metric(pytorch_report, "pytorch") + + # Compare backend performance + compare_backends_performance( + autodeploy_tokens_per_sec, + pytorch_tokens_per_sec, + relative_tolerance=backend_relative_tolerance, + absolute_tolerance=backend_absolute_tolerance, + ) + + # Validate KV cache metrics + validate_kv_cache_metrics_dynamic(kv_cache_metrics, expected_metrics) + print("✅ KV Cache Metrics validation passed") + + print("=== BACKEND COMPARISON TEST PASSED ===") + print(f"Autodeploy: {autodeploy_tokens_per_sec:.2f} tokens/sec/user") + print(f"PyTorch: {pytorch_tokens_per_sec:.2f} tokens/sec/user") + + elif comparison_mode == "golden": + # Golden comparison mode: compare against golden values + print("=== PERFORMANCE METRICS ===") + print(f"Measured performance: {autodeploy_tokens_per_sec:.2f} tokens/sec/user") + print(f"Golden performance: {golden_tokens_per_sec:.2f} tokens/sec/user") + + # Print KV cache metrics + print_kv_cache_metrics(kv_cache_metrics) + + # Performance validation + assert_performance_within_tolerance( + autodeploy_tokens_per_sec, + golden_tokens_per_sec, + relative_tolerance=golden_relative_tolerance, + absolute_tolerance=golden_absolute_tolerance, + ) + + # KV cache metrics validation + print( + f"Validating {len(kv_cache_metrics)} KV cache metrics against GPU-specific ranges..." 
+ ) + validate_kv_cache_metrics_dynamic(kv_cache_metrics, expected_metrics) + + print("=== ALL TESTS PASSED ===") + print(f"Performance: ✅ {autodeploy_tokens_per_sec:.2f} tokens/sec/user within bounds") + print("KV Cache Metrics: ✅ All metrics within GPU-specific expected ranges") + + else: + raise ValueError( + f"Invalid comparison_mode: {comparison_mode}. Must be 'backend' or 'golden'" + ) def test_trtllm_bench(llm_root): # noqa: F811 @@ -70,15 +591,20 @@ def test_trtllm_bench(llm_root): # noqa: F811 ) with tempfile.TemporaryDirectory() as temp_dir: - with open(f"{temp_dir}/model_kwargs.yaml", "w") as f: + with open(f"{temp_dir}/extra_llm_api_options.yaml", "w") as f: yaml.dump( { "model_kwargs": {"num_hidden_layers": 2}, "cuda_graph_batch_sizes": [1, 2], - "max_batch_size": 128, }, f, ) dataset_path = prepare_dataset(llm_root, temp_dir, model_name) run_benchmark(model_name, dataset_path, temp_dir) + + +@pytest.mark.no_xdist +def test_trtllm_bench_backend_comparison(llm_root): # noqa: F811 + """Test that compares autodeploy backend performance against pytorch backend.""" + trtllm_bench_unified_comparison(llm_root, comparison_mode="backend") diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 541965b588f..e519df1cf2c 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -471,6 +471,7 @@ def test_llama_7b_lora_config_overrides_peft_cache_config(): # TODO smor: currently Nemotron-Super-49B-v1 with LoRA memory consumption is overly high # https://jirasw.nvidia.com/browse/TRTLLM-5045 +@pytest.mark.skip(reason="https://nvbugs/5448464") @skip_gpu_memory_less_than_138gb def test_nemotron_nas_lora() -> None: lora_config = LoraConfig(lora_dir=[