From e88cb92f2490fb7f76dfdd17a144f92ce3f19d16 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Mon, 18 Aug 2025 13:47:14 +0800 Subject: [PATCH 01/20] [None] [feat] Support accurate device iter time (#6906) Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 44 +++++++++++++++++-- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index af39b213042..4573528e314 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -403,6 +403,16 @@ def _profiler(self): it = -1 enabled = False start_time = None + + # These events are used to record the time of the previous batch. + # We need two set of the start-end events to record the time through + # a ping-pong way so that it works with overlap scheduler. + start_event_1 = None + end_event_1 = torch.cuda.Event(enable_timing=True) + start_event_2 = None + end_event_2 = torch.cuda.Event(enable_timing=True) + prev_device_step_time = None + torch_trace_path = os.environ.get(PROFILE_TRACE_ENV_VAR_NAME, None) profile_start_stop = os.environ.get(PROFILE_START_STOP_ENV_VAR_NAME, None) @@ -425,7 +435,7 @@ def _profiler(self): with_modules=True) def profile_step(): - nonlocal it, enabled, start_time + nonlocal it, enabled, start_time, start_event_1, end_event_1, start_event_2, end_event_2, prev_device_step_time if it in self.profile_stop_iters and not self.is_warmup: assert enabled, "Inconsistent CUDA profiling state" if enable_torch_trace: @@ -438,7 +448,24 @@ def profile_step(): if start_time is not None and self.print_log and self.dist.rank == 0: end_time = time.time() - + if it % 2 == 0: + end_event_1.record() + if start_event_2 is not None: + end_event_2.synchronize() + prev_device_step_time = start_event_2.elapsed_time( + end_event_2) + else: + end_event_2.record() + if start_event_1 is not None: + end_event_1.synchronize() + prev_device_step_time = start_event_1.elapsed_time( + end_event_1) + + if prev_device_step_time is None: + prev_device_step_time = "N/A" # Handle first iteration + else: + prev_device_step_time = f"{prev_device_step_time}ms" + host_step_time = (end_time - start_time) * 1000 # milliseconds formatted_timestamp = datetime.datetime.now().strftime( "%Y-%m-%d %H:%M:%S") logger.info( @@ -447,7 +474,8 @@ def profile_step(): f"rank = {self.dist.rank}, " f"currank_total_requests = {self.executor_request_queue.num_fetch_requests_cur_rank}/" f"{self.executor_request_queue.num_fetch_requests}, " - f"elapsed_time = {end_time - start_time}s, " + f"host_step_time = {host_step_time}ms, " + f"prev_device_step_time = {prev_device_step_time}, " f"timestamp = {formatted_timestamp}, " f"num_scheduled_requests: {self.num_scheduled_requests}, " f"states = {self.model_engine.iter_states}") @@ -462,6 +490,14 @@ def profile_step(): logger.info(f"Profiling started at iteration {it}.") enabled = True start_time = time.time() + if it % 2 == 0: + if start_event_1 is None: + start_event_1 = torch.cuda.Event(enable_timing=True) + start_event_1.record() + else: + if start_event_2 is None: + start_event_2 = torch.cuda.Event(enable_timing=True) + start_event_2.record() try: yield profile_step @@ -1400,7 +1436,7 @@ def _forward_step(self, new_tensors_device: Optional[SampleStateTensors] = None): @nvtx_range( - f"[Executor] _forward_step {self.model_engine.iter_counter}: {len(scheduled_requests.context_requests)} ctx reqs, {len(scheduled_requests.generation_requests)} gen reqs" + f"[Executor] _forward_step {self.model_engine.iter_counter + 1}: {len(scheduled_requests.context_requests)} ctx reqs, {len(scheduled_requests.generation_requests)} gen reqs" ) def forward(scheduled_requests, resource_manager, new_tensors_device, gather_context_logits, cache_indirection_buffer): From 5ec15b98f0692471693ee1f086a351c6afc25953 Mon Sep 17 00:00:00 2001 From: Shi Xiaowei <39303645+Shixiaowei02@users.noreply.github.com> Date: Mon, 18 Aug 2025 14:33:23 +0800 Subject: [PATCH 02/20] [TRTLLM-7030][fix] uppercase def value in pd-config (#6981) Signed-off-by: ShiXiaowei02 <39303645+Shixiaowei02@users.noreply.github.com> --- .../test_configs/disagg_config_gen_only_bs1.yaml | 4 ++-- tests/integration/test_lists/waives.txt | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml index 4efbc9a9493..19d1eca714f 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml @@ -17,7 +17,7 @@ context_servers: enable_partial_reuse: False disable_overlap_scheduler: True cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8001" generation_servers: @@ -32,6 +32,6 @@ generation_servers: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False cache_transceiver_config: - backend: default + backend: DEFAULT urls: - "localhost:8002" diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index f681b1b196b..7a75485ec6c 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -315,9 +315,5 @@ triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4] SKIP (https://nvbugs/5434320) accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend SKIP (https://nvbugs/5448437) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5448437) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5459811) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] SKIP (https://nvbugs/5459811) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5459811) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5459811) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5445466) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5459817) From 55f4f2d80c0da73bdd828d695d3964390c4a4821 Mon Sep 17 00:00:00 2001 From: ChristinaZ <83400082+ChristinaZ@users.noreply.github.com> Date: Mon, 18 Aug 2025 15:08:32 +0800 Subject: [PATCH 03/20] [None] [fix] Fix the macro name (#6983) Signed-off-by: Christina Zhang <83400082+ChristinaZ@users.noreply.github.com> --- .../trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu index 601229591e5..f03e02c2e29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu @@ -199,7 +199,7 @@ __global__ void __launch_bounds__(NumThreadsSingleBlock) routingIndicesBlockKern } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) -#if !defined(FDL_PROFILE) || FDL_PROFILE == 0 +#if !defined(PDL_PROFILE) || PDL_PROFILE == 0 // we can trigger the next kernel at this point if constexpr (KernelParams::UsePdl) { From 69ff32f9b1b3c8d7437dabda00d302c74633b4a1 Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Mon, 18 Aug 2025 20:34:52 +0800 Subject: [PATCH 04/20] [None][infra] Waive failed tests on main 0818 (#6992) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 7a75485ec6c..2459426b917 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -316,4 +316,7 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend SKIP (https://nvbugs/5448437) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5448437) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5445466) +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5445466) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5459817) +llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5461796) +disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5459811) From 1ce23545fc337f7b240b4d8a983297224497a2dc Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Mon, 18 Aug 2025 21:15:49 +0800 Subject: [PATCH 05/20] [None][chore] Remove duplicate test waives (#6998) Signed-off-by: Yiqing Yan --- tests/integration/test_lists/waives.txt | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 2459426b917..49c9a6d0107 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -264,18 +264,14 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype SKIP (https://nvbu accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope SKIP (https://nvbugs/5433543) accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5433545) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5431139) -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4] SKIP (https://nvbugs/5431139) -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5431139) examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451) examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451) examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405) accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241) -test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060) -test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060) +test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095) +test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] SKIP (https://nvbugs/5433545) -test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444095) -test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444095) examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP (https://nvbugs/5444636) accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive SKIP (https://nvbugs/5444627) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] SKIP (https://nvbugs/5444687) @@ -294,10 +290,9 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5403818) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False] SKIP (https://nvbugs/5403818) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5403818) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5442827) +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5442827,https://nvbugs/5445466) test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] SKIP (https://nvbugs/5453992) accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype SKIP (https://nvbugs/5454875) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5445466) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5454898) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5454898) examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5434372) From 425dad01fdf24a65132d62a6ad2736ce46b0000f Mon Sep 17 00:00:00 2001 From: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> Date: Mon, 18 Aug 2025 08:20:51 -0700 Subject: [PATCH 06/20] [None][fix] Clean up linking to CUDA stub libraries in build_wheel.py (#6823) Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com> Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> Co-authored-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com> --- cpp/tensorrt_llm/nanobind/CMakeLists.txt | 3 +- cpp/tensorrt_llm/pybind/CMakeLists.txt | 3 +- docker/Dockerfile.multi | 5 +- scripts/build_wheel.py | 223 +++++++++++++++-------- 4 files changed, 153 insertions(+), 81 deletions(-) diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt index 8367d007971..1ccb50a02b7 100755 --- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt +++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt @@ -43,6 +43,7 @@ target_link_libraries( ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python + CUDA::cuda_driver ${CUDA_NVML_LIB} th_common) target_compile_definitions( @@ -54,6 +55,6 @@ if(NOT WIN32) ${TRTLLM_NB_MODULE} PROPERTIES LINK_FLAGS - "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" + "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" ) endif() diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt index 9e6e2909395..91b5ebf5482 100755 --- a/cpp/tensorrt_llm/pybind/CMakeLists.txt +++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt @@ -44,6 +44,7 @@ target_link_libraries( ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python + CUDA::cuda_driver ${CUDA_NVML_LIB} th_common) target_compile_definitions( @@ -55,6 +56,6 @@ if(NOT WIN32) ${TRTLLM_PYBIND_MODULE} PROPERTIES LINK_FLAGS - "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" + "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" ) endif() diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index c832481da9f..eeafc8f4a65 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -71,8 +71,9 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999" # Install OpenCV with FFMPEG support -RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/ -RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir +RUN pip3 uninstall -y opencv && \ + rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \ + pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir # WARs against security issues inherited from pytorch:25.06 # * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7 diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 2ac3b484835..e40543c78f3 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -16,8 +16,10 @@ import os import platform +import re import sys import sysconfig +import tempfile import warnings from argparse import ArgumentParser from contextlib import contextmanager @@ -27,7 +29,7 @@ from shutil import copy, copytree, rmtree from subprocess import DEVNULL, CalledProcessError, check_output, run from textwrap import dedent -from typing import List +from typing import Sequence try: from packaging.requirements import Requirement @@ -120,7 +122,8 @@ def create_venv(project_dir: Path): return venv_prefix -def setup_venv(project_dir: Path, requirements_file: Path, no_venv: bool): +def setup_venv(project_dir: Path, requirements_file: Path, + no_venv: bool) -> tuple[Path, Path]: """Creates/updates a venv and installs requirements. Args: @@ -279,6 +282,139 @@ def generate_fmha_cu(project_dir, venv_python): os.chdir(project_dir) +def create_cuda_stub_links(cuda_stub_dir: str, missing_libs: list[str]) -> str: + """ + Creates symbolic links for CUDA stub libraries in a temporary directory. + + Args: + cuda_stub_dir (str): Path to the directory containing CUDA stubs. + missing_libs: Versioned names of the missing libraries. + + Returns: + str: Path to the temporary directory where links were created. + """ + cuda_stub_path = Path(cuda_stub_dir) + if not cuda_stub_path.exists(): + raise RuntimeError( + f"CUDA stub directory '{cuda_stub_dir}' does not exist.") + + # Create a temporary directory for the symbolic links + temp_dir = tempfile.mkdtemp(prefix="cuda_stub_links_") + temp_dir_path = Path(temp_dir) + + version_pattern = r'\.\d+' + for missing_lib in filter(lambda x: re.search(version_pattern, x), + missing_libs): + # Define `so` as the first part of `missing_lib` with trailing '.' and digits removed + so = cuda_stub_path / re.sub(version_pattern, '', missing_lib) + so_versioned = temp_dir_path / missing_lib + + # Check if the library exists in the original directory + if so.exists(): + try: + # Create the symbolic link in the temporary directory + so_versioned.symlink_to(so) + except OSError as e: + # Clean up the temporary directory on error + rmtree(temp_dir) + raise RuntimeError( + f"Failed to create symbolic link for '{missing_lib}' in temporary directory '{temp_dir}': {e}" + ) + else: + warnings.warn( + f"Warning: Source library '{so}' does not exist and was skipped." + ) + + # Return the path to the temporary directory where the links were created + return str(temp_dir_path) + + +def check_missing_libs(so_prefix: str) -> list[str]: + result = build_run(f"ldd {so_prefix}.cpython*.so", + capture_output=True, + text=True) + missing = [] + for line in result.stdout.splitlines(): + if "not found" in line: + lib_name = line.split()[ + 0] # Extract the library name before "=> not found" + if lib_name not in missing: + missing.append(lib_name) + return missing + + +def generate_python_stubs_linux(binding_type: str, venv_python: Path, + deep_ep: bool): + is_nanobind = binding_type == "nanobind" + if is_nanobind: + build_run(f"\"{venv_python}\" -m pip install nanobind") + build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen") + + env_stub_gen = os.environ.copy() + cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get( + "CUDA_PATH") or "/usr/local/cuda" + missing_libs = check_missing_libs("bindings") + cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs" + + if missing_libs and Path(cuda_stub_dir).exists(): + # Create symbolic links for the CUDA stubs + link_dir = create_cuda_stub_links(cuda_stub_dir, missing_libs) + ld_library_path = env_stub_gen.get("LD_LIBRARY_PATH") + env_stub_gen["LD_LIBRARY_PATH"] = ":".join( + filter(None, [link_dir, cuda_stub_dir, ld_library_path])) + else: + link_dir = None + + try: + if is_nanobind: + build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .", + env=env_stub_gen) + else: + build_run( + f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code", + env=env_stub_gen) + build_run( + f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code", + env=env_stub_gen) + if deep_ep: + build_run( + f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code", + env=env_stub_gen) + finally: + if link_dir: + rmtree(link_dir) + + +def generate_python_stubs_windows(binding_type: str, venv_python: Path, + pkg_dir: Path, lib_dir: Path): + if binding_type == "nanobind": + print("Windows not yet supported for nanobind stubs") + exit(1) + else: + build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen") + stubgen = "stubgen.py" + stubgen_contents = """ + # Loading torch, trt before bindings is required to avoid import errors on windows. + # isort: off + import torch + import tensorrt as trt + # isort: on + import os + import platform + + from pybind11_stubgen import main + + if __name__ == "__main__": + # Load dlls from `libs` directory before launching bindings. + if platform.system() == "Windows": + os.add_dll_directory(r\"{lib_dir}\") + main() + """.format(lib_dir=lib_dir) + (pkg_dir / stubgen).write_text(dedent(stubgen_contents)) + build_run(f"\"{venv_python}\" {stubgen} -o . bindings") + (pkg_dir / stubgen).unlink() + + def main(*, build_type: str = "Release", generator: str = "", @@ -286,7 +422,7 @@ def main(*, dist_dir: Path = None, cuda_architectures: str = None, job_count: int = None, - extra_cmake_vars: List[str] = list(), + extra_cmake_vars: Sequence[str] = tuple(), extra_make_targets: str = "", trt_root: str = '/usr/local/tensorrt', nccl_root: str = None, @@ -361,7 +497,7 @@ def main(*, if on_windows: # Windows does not support multi-device currently. - extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"]) + extra_cmake_vars = list(extra_cmake_vars) + ["ENABLE_MULTI_DEVICE=0"] # The Ninja CMake generator is used for our Windows build # (Easier than MSBuild to make compatible with our Docker image) @@ -703,81 +839,14 @@ def get_binding_lib(subdirectory, name): dirs_exist_ok=True) if not skip_stubs: - with working_directory(project_dir): - if binding_type == "nanobind": - build_run(f"\"{venv_python}\" -m pip install nanobind") - else: - build_run( - f"\"{venv_python}\" -m pip install pybind11-stubgen") with working_directory(pkg_dir): if on_windows: - if binding_type == "nanobind": - print("Windows not yet supported for nanobind stubs") - exit(1) - else: - stubgen = "stubgen.py" - stubgen_contents = """ - # Loading torch, trt before bindings is required to avoid import errors on windows. - # isort: off - import torch - import tensorrt as trt - # isort: on - import os - import platform - - from pybind11_stubgen import main - - if __name__ == "__main__": - # Load dlls from `libs` directory before launching bindings. - if platform.system() == "Windows": - os.add_dll_directory(r\"{lib_dir}\") - main() - """.format(lib_dir=lib_dir) - (pkg_dir / stubgen).write_text(dedent(stubgen_contents)) - build_run(f"\"{venv_python}\" {stubgen} -o . bindings") - (pkg_dir / stubgen).unlink() - else: - env_ld = os.environ.copy() - - new_library_path = "/usr/local/cuda/compat:/usr/local/cuda/compat/lib:/usr/local/cuda/compat/lib.real" - if 'LD_LIBRARY_PATH' in env_ld: - new_library_path += f":{env_ld['LD_LIBRARY_PATH']}" - - result = build_run("find /usr -name *libnvidia-ml.so*", - capture_output=True, - text=True) - assert result.returncode == 0, f"Failed to run find *libnvidia-ml.so*: {result.stderr}" - - # Build containers only contain stub version of libnvidia-ml.so and not the real version. - # If real version not in system, we need to create symbolic link to stub version to prevent import errors. - if "libnvidia-ml.so.1" not in result.stdout: - if "libnvidia-ml.so" in result.stdout: - line = result.stdout.splitlines()[0] - path = os.path.dirname(line) - new_library_path += f":{path}" - build_run(f"ln -s {line} {path}/libnvidia-ml.so.1") - else: - print( - f"Failed to find libnvidia-ml.so: {result.stderr}", - file=sys.stderr) - exit(1) - - env_ld["LD_LIBRARY_PATH"] = new_library_path - if binding_type == "nanobind": - build_run( - f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .", - env=env_ld) - else: - build_run( - f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code", - env=env_ld) - if deep_ep_cuda_architectures: - build_run( - f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code", - env=env_ld) - build_run( - f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code", - env=env_ld) + generate_python_stubs_windows(binding_type, venv_python, + pkg_dir, lib_dir) + else: # on linux + generate_python_stubs_linux( + binding_type, venv_python, + bool(deep_ep_cuda_architectures)) if not skip_building_wheel: if dist_dir is None: From d1d17dbebab05b11e74ac373ec7c55b3326c16b4 Mon Sep 17 00:00:00 2001 From: Yanchao Lu Date: Tue, 19 Aug 2025 01:35:30 +0800 Subject: [PATCH 07/20] [None][infra] Cherry-pick #6836 from main branch and improve SSH connection (#6971) (#7005) Signed-off-by: Yanchao Lu Co-authored-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> --- jenkins/L0_Test.groovy | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 3ed53788815..c1025efbc7e 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -99,6 +99,8 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models" ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false +COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" + def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){ withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { def remote = [ @@ -113,7 +115,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st pipeline.stage('Submit Test Results') { sh "mkdir -p ${stageName}" def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml" - def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/" + def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/" def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0 if (downloadSucceed) { sh "ls ${stageName}" @@ -239,7 +241,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true) - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",) Utils.exec( pipeline, @@ -327,7 +329,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL stage('Prepare Testing') { // Create Job Workspace folder in Frontend Node - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh -oStrictHostKeyChecking=no ${remote.user}@${remote.host} 'mkdir ${jobWorkspace}'",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host} 'mkdir -p ${jobWorkspace}'",) // Download and Unzip Tar File trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}") @@ -336,11 +338,11 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL // Upload slurm_run_sh to Frontend node def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh" Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true) - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",) // Upload waives.txt to Frontend node def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt" - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",) // Generate Test List and Upload to Frontend Node def makoArgs = getMakoArgsFromStageName(stageName, true) @@ -349,7 +351,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL // if the line cannot be split by "=", just ignore that line. def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs) def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson) - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",) // Generate Multi Node Job Launch Script def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#") @@ -393,7 +395,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL """.stripIndent() pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent) Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true) - Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",) + Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",) } stage('Run Test') { def scriptLaunch = "${jobWorkspace}/slurm_launch.sh" @@ -1089,7 +1091,7 @@ def getSSHConnectionPorts(portConfigFile, stageName) usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'), string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME') ]) { - portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'netstat -tuln'",returnStdout: true) + portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'netstat -tuln'", returnStdout: true) } echo "Port Usage: ${portUsage}" @@ -1248,7 +1250,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO def llmRootConfig = "${LLM_ROOT}${config}" sh "mkdir ${llmRootConfig}" - def llmPath = sh (script: "realpath ${llmRootConfig}",returnStdout: true).trim() + def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim() def llmSrc = "${llmPath}/TensorRT-LLM/src" echoNodeAndGpuInfo(pipeline, stageName) @@ -1362,9 +1364,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'), string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME') ]) { - sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub" - sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys" - sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}" + sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub" + sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys" + sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}" def (int userPort, int monitorPort) = getSSHConnectionPorts(portConfigFilePath, stageName) if (userPort == 0) { @@ -1373,7 +1375,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO return } - sh "ssh -f -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}" + sh "ssh -f ${COMMON_SSH_OPTIONS} -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}" sh "autossh -fNR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}" sh "ps aux | grep ssh" try { From d16af87d037f30f7fb487777a602785c1d60ec87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20C=C3=A1mpora?= <961215+dcampora@users.noreply.github.com> Date: Tue, 19 Aug 2025 00:10:05 +0200 Subject: [PATCH 08/20] [TRTLLM-7158][feat] Introduce sampler options in trtllm bench (#6855) Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com> --- tensorrt_llm/bench/benchmark/low_latency.py | 25 +++++++++---- tensorrt_llm/bench/benchmark/throughput.py | 24 ++++++++++--- tensorrt_llm/bench/benchmark/utils/general.py | 36 +++++++++++++++++++ 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/tensorrt_llm/bench/benchmark/low_latency.py b/tensorrt_llm/bench/benchmark/low_latency.py index 2ee3e7ea5ce..ad200af9c69 100644 --- a/tensorrt_llm/bench/benchmark/low_latency.py +++ b/tensorrt_llm/bench/benchmark/low_latency.py @@ -25,7 +25,7 @@ from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode # isort: off -from tensorrt_llm.bench.benchmark.utils.general import get_settings_from_engine, get_settings, ALL_SUPPORTED_BACKENDS +from tensorrt_llm.bench.benchmark.utils.general import get_settings_from_engine, get_settings, update_sampler_args_with_extra_options, ALL_SUPPORTED_BACKENDS # isort: on from tensorrt_llm.bench.utils.data import (create_dataset_from_stream, initialize_tokenizer, @@ -135,6 +135,13 @@ default=1, help="Number of search beams.", ) +@optgroup.option("--sampler_options", + type=click.Path(exists=True, + readable=True, + path_type=Path, + resolve_path=True), + default=None, + help="Path to a YAML file that sets sampler options.") @optgroup.option( "--concurrency", type=int, @@ -326,12 +333,16 @@ def latency_command( eos_id = tokenizer.eos_token_id if not ignore_eos else -1 pad_id = tokenizer.pad_token_id if not ignore_eos else -1 - sampling_params = SamplingParams( - end_id=eos_id, - pad_id=pad_id, - n=beam_width, - use_beam_search=beam_width > 1, - ) + sampler_args = { + "end_id": eos_id, + "pad_id": pad_id, + "n": beam_width, + "use_beam_search": beam_width > 1 + } + sampler_args = update_sampler_args_with_extra_options( + sampler_args, params.pop("sampler_options")) + sampling_params = SamplingParams(**sampler_args) + post_proc_params = None # No detokenization # Perform warmup if requested. diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py index 18d7980ea4d..57c86ac0f34 100755 --- a/tensorrt_llm/bench/benchmark/throughput.py +++ b/tensorrt_llm/bench/benchmark/throughput.py @@ -22,7 +22,8 @@ from tensorrt_llm import LLM as PyTorchLLM from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM -from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset +from tensorrt_llm.bench.benchmark.utils.general import ( + generate_warmup_dataset, update_sampler_args_with_extra_options) from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment from tensorrt_llm.bench.dataclasses.reporting import ReportUtility @@ -67,6 +68,13 @@ help= "Path to a YAML file that overwrites the parameters specified by trtllm-bench." ) +@optgroup.option("--sampler_options", + type=click.Path(exists=True, + readable=True, + path_type=Path, + resolve_path=True), + default=None, + help="Path to a YAML file that sets sampler options.") @optgroup.option( "--max_batch_size", type=int, @@ -455,10 +463,16 @@ def ignore_trt_only_args(kwargs: dict): else: llm = LLM(**kwargs) - sampling_params = SamplingParams(end_id=eos_id, - pad_id=eos_id, - n=beam_width, - use_beam_search=beam_width > 1) + sampler_args = { + "end_id": eos_id, + "pad_id": eos_id, + "n": beam_width, + "use_beam_search": beam_width > 1 + } + sampler_args = update_sampler_args_with_extra_options( + sampler_args, params.pop("sampler_options")) + sampling_params = SamplingParams(**sampler_args) + post_proc_params = None # No detokenization # Perform warmup if requested. diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py index 45a7a32c1ba..ff3cd933ce1 100755 --- a/tensorrt_llm/bench/benchmark/utils/general.py +++ b/tensorrt_llm/bench/benchmark/utils/general.py @@ -199,3 +199,39 @@ def generate_warmup_dataset(requests, steps) -> List[InferenceRequest]: warm_up_dataset = choices(requests, k=steps) shuffle(warm_up_dataset) return warm_up_dataset + + +def update_sampler_args_with_extra_options(sampler_args: Dict, + sampler_options: str) -> Dict: + """Update sampler arguments with options from a YAML file. + + Args: + sampler_args: Base sampler arguments dictionary. + sampler_options: Path to YAML file containing additional options. + + Returns: + Dict: Merged sampler arguments. + + Raises: + FileNotFoundError: If the YAML file doesn't exist. + yaml.YAMLError: If the YAML file is malformed. + TypeError: If the YAML content is not a dictionary. + """ + if sampler_options is not None: + try: + with open(sampler_options, 'r') as f: + sampler_options_dict = yaml.safe_load(f) + except FileNotFoundError: + raise FileNotFoundError( + f"Sampler options file not found: {sampler_options}") + except yaml.YAMLError as e: + raise yaml.YAMLError( + f"Invalid YAML in sampler options file {sampler_options}: {e}") + + if not isinstance(sampler_options_dict, dict): + raise TypeError( + f"Sampler options file {sampler_options} must contain a dictionary, " + f"got {type(sampler_options_dict)}") + + sampler_args = sampler_args | sampler_options_dict + return sampler_args From e76e5c640fe97dab94f50f0a02af5e9952d8aca7 Mon Sep 17 00:00:00 2001 From: Leslie Fang Date: Tue, 19 Aug 2025 07:42:52 +0800 Subject: [PATCH 09/20] [None][infra] Enable accuracy test for mtp and chunked prefill (#6314) Signed-off-by: leslie-fang25 --- .../features/feature_combination_matrix.md | 2 +- tests/integration/defs/.test_durations | 20 ++++++------- .../defs/accuracy/test_llm_api_pytorch.py | 7 ++++- .../test_lists/qa/llm_function_full.txt | 2 +- .../test_lists/qa/llm_function_rtx6kd.txt | 20 ++++++------- .../test_lists/qa/llm_function_sanity.txt | 2 +- .../test_lists/test-db/l0_b200.yml | 5 ++-- .../test_lists/test-db/l0_gb200.yml | 3 ++ .../test_lists/test-db/l0_h100.yml | 28 +++++++++---------- 9 files changed, 49 insertions(+), 40 deletions(-) diff --git a/docs/source/torch/features/feature_combination_matrix.md b/docs/source/torch/features/feature_combination_matrix.md index eee4ca7e155..f39a800fcdb 100644 --- a/docs/source/torch/features/feature_combination_matrix.md +++ b/docs/source/torch/features/feature_combination_matrix.md @@ -7,7 +7,7 @@ | Attention Data Parallelism | Yes | Yes | --- | | | | | | | | | | | | | Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | | | Chunked Prefill | Yes | Yes | Yes | Untested | --- | | | | | | | | | | -| MTP | Yes | Yes | Yes | Yes | Untested | --- | | | | | | | | | +| MTP | Yes | Yes | Yes | Yes | Yes | --- | | | | | | | | | | EAGLE-3(One Model Engine) | Yes | Yes | Yes | Yes | Yes | No | --- | | | | | | | | | EAGLE-3(Two Model Engine) | NO | Yes | Yes | Yes | Yes | No | No | --- | | | | | | | | Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | | diff --git a/tests/integration/defs/.test_durations b/tests/integration/defs/.test_durations index a2ca6317b6c..23a7d075d94 100644 --- a/tests/integration/defs/.test_durations +++ b/tests/integration/defs/.test_durations @@ -281,11 +281,11 @@ "disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0]": 98.97588296607137, "disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]": 67.9668476767838, "test_unittests.py::test_unittests_v2[unittest/_torch/test_attention_mla.py]": 26.32902159006335, - "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 591.2785023800097, - "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]": 306.84709841990843, - "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 220.57452515885234, - "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]": 202.22269394202158, - "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]": 165.08514453098178, + "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 591.2785023800097, + "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 306.84709841990843, + "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 220.57452515885234, + "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 202.22269394202158, + "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 165.08514453098178, "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]": 252.70569713797886, "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]": 85.24235329206567, "test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]": 81.43792725296225, @@ -305,11 +305,11 @@ "test_e2e.py::test_llmapi_load_engine_from_build_command[llama-llama-models/llama-7b-hf]": 200.82293555140495, "test_unittests.py::test_unittests_v2[unittest/trt/model/test_llama.py]": 1494.1103300452232, "test_unittests.py::test_unittests_v2[unittest/trt/attention/test_gpt_attention.py -k \"partition0\"]": 77.31474154582247, - "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 295.3527018489549, - "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]": 143.84012729604729, - "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]": 107.58471493399702, - "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]": 205.7252635700861, - "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 113.82226522010751, + "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 295.3527018489549, + "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 143.84012729604729, + "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 107.58471493399702, + "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 205.7252635700861, + "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 113.82226522010751, "examples/test_llama.py::test_llm_llama_1gpu[llama-3.1-8b-instruct-hf-fp8-enable_fp8-float16-summarization-nb:1]": 853.2910006027669, "test_e2e.py::test_openai_chat_example": 876.1966922096908, "test_e2e.py::test_trtllm_serve_example": 200.09309104084969, diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 649e826207a..89483fd2620 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -930,6 +930,10 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16" @pytest.mark.skip_less_device_memory(60000) + # Chunked Prefill for MLA can only be enabled on SM100 + @parametrize_with_ids( + "enable_chunked_prefill", + [False, pytest.param(True, marks=skip_pre_blackwell)]) @parametrize_with_ids("torch_compile", [False, True]) @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler", [(False, False, False), (True, False, False), @@ -939,7 +943,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): @parametrize_with_ids("mtp_nextn", [0, pytest.param(2, marks=skip_pre_hopper)]) def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph, - overlap_scheduler, torch_compile): + overlap_scheduler, torch_compile, enable_chunked_prefill): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) torch_compile_config = TorchCompileConfig( enable_fullgraph=True, @@ -955,6 +959,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph, mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config, + enable_chunked_prefill=enable_chunked_prefill, **pytorch_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) as llm: diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index ab43ce124f9..1859762fc17 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -481,7 +481,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_pref accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2 -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] diff --git a/tests/integration/test_lists/qa/llm_function_rtx6kd.txt b/tests/integration/test_lists/qa/llm_function_rtx6kd.txt index fbabac6b84f..b3d14c393bb 100644 --- a/tests/integration/test_lists/qa/llm_function_rtx6kd.txt +++ b/tests/integration/test_lists/qa/llm_function_rtx6kd.txt @@ -1,16 +1,16 @@ accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2 accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights accuracy/test_cli_flow.py::TestMixtral8x7B::test_nvfp4_prequantized -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index aeaa1ba573b..1943a015cf6 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -36,7 +36,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 26b4b2a0a88..cb36129a147 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -17,8 +17,9 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4_streaming[stream_interval_4] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4_streaming[stream_interval_64] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml index 6e1f6219472..ac39fbdc88c 100644 --- a/tests/integration/test_lists/test-db/l0_gb200.yml +++ b/tests/integration/test_lists/test-db/l0_gb200.yml @@ -20,6 +20,9 @@ l0_gb200: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 481c5f709ff..64f6498d095 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -178,20 +178,20 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-attn_backend=FLASHINFER-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=FLASHINFER-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] From 97ba0eb879750ee67bd55920ac511bc66e458adc Mon Sep 17 00:00:00 2001 From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> Date: Mon, 18 Aug 2025 17:43:28 -0700 Subject: [PATCH 10/20] [None][autodeploy] Doc: fix link path in trtllm bench doc (#7007) Signed-off-by: Frida Hou <201670829+Fridah-nv@users.noreply.github.com> --- .../auto_deploy/advanced/benchmarking_with_trtllm_bench.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md b/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md index 10515500797..6032aacd4fa 100644 --- a/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md +++ b/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md @@ -4,7 +4,7 @@ AutoDeploy is integrated with the `trtllm-bench` performance benchmarking utilit ## Getting Started -Before benchmarking with AutoDeploy, review the [TensorRT-LLM benchmarking guide](../../performance/perf-benchmarking.md#running-with-the-pytorch-workflow) to familiarize yourself with the standard trtllm-bench workflow and best practices. +Before benchmarking with AutoDeploy, review the [TensorRT-LLM benchmarking guide](../../../performance/perf-benchmarking.md#running-with-the-pytorch-workflow) to familiarize yourself with the standard trtllm-bench workflow and best practices. ## Basic Usage From dabebb2c7a6994821e045bd6713c092a6f26a0d5 Mon Sep 17 00:00:00 2001 From: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> Date: Tue, 19 Aug 2025 09:42:35 +0800 Subject: [PATCH 11/20] [https://nvbugs/5371480][fix] Enable test_phi3_small_8k (#6938) Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> --- tests/unittest/llmapi/test_llm_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unittest/llmapi/test_llm_models.py b/tests/unittest/llmapi/test_llm_models.py index 737511d2156..4fbc00ddf76 100644 --- a/tests/unittest/llmapi/test_llm_models.py +++ b/tests/unittest/llmapi/test_llm_models.py @@ -110,7 +110,6 @@ def test_llm_phi_3_mini_4k(): sampling_params=phi3_mini_4k_sampling_params) -@pytest.mark.skip(reason="https://nvbugs/5371480") @force_ampere def test_llm_phi_3_small_8k(): phi_requirement_path = os.path.join( From 71e28eab36f3221f2bda0bfcf5be2045504150d5 Mon Sep 17 00:00:00 2001 From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com> Date: Tue, 19 Aug 2025 09:58:22 +0800 Subject: [PATCH 12/20] [TRTLLM-7014][chore] Add accuracy test for ctx and gen workers with different models (#6741) Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com> --- .../defs/accuracy/references/gsm8k.yaml | 1 + .../accuracy/test_disaggregated_serving.py | 76 ++++++++++++++----- .../test_lists/qa/llm_function_sanity.txt | 1 + .../test_lists/test-db/l0_dgx_b200.yml | 1 + 4 files changed, 62 insertions(+), 17 deletions(-) diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 29458d3bf49..26de82cbc09 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -80,6 +80,7 @@ Qwen3/Qwen3-8B: kv_cache_quant_algo: FP8 accuracy: 87.1114 Qwen3/Qwen3-30B-A3B: + - accuracy: 83.43 - quant_algo: FP8_BLOCK_SCALES accuracy: 84.36 - quant_algo: FP8 diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 051c5401a06..51a572ce493 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -23,7 +23,7 @@ from tensorrt_llm.llmapi.tokenizer import load_hf_tokenizer from ..conftest import (get_device_count, llm_models_root, parametrize_with_ids, - skip_pre_hopper) + skip_pre_blackwell, skip_pre_hopper) from ..trt_test_alternative import popen from .accuracy_core import (GSM8K, MMLU, JsonModeEval, LlmapiAccuracyTestHarness, get_accuracy_task) @@ -71,7 +71,9 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any], ctx_server_config: Dict[str, Any], gen_server_config: Dict[str, Any], model_name: str, - tensor_parallel_size: int = 1): + tensor_parallel_size: int = 1, + ctx_model: str = None, + gen_model: str = None): temp_dir = tempfile.TemporaryDirectory() disaggregated_serving_config_path = os.path.join( temp_dir.name, "disaggregated_serving_config.yaml") @@ -97,9 +99,19 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any], trtllm_serve_path = "trtllm-serve" # Common arguments for both servers - common_args = [ + ctx_model = ctx_model or model_name + gen_model = gen_model or model_name + ctx_args = [ trtllm_serve_path, - model_name, + ctx_model, + "--host", + "localhost", + "--backend", + "pytorch", + ] + gen_args = [ + trtllm_serve_path, + gen_model, "--host", "localhost", "--backend", @@ -125,11 +137,11 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any], env_gen["TRTLLM_USE_UCX_KVCACHE"] = "1" env_gen["CUDA_VISIBLE_DEVICES"] = ",".join( map(str, range(ctx_total_gpus, ctx_total_gpus + gen_total_gpus))) - ctx_server_args = common_args + [ + ctx_server_args = ctx_args + [ "--port", "8001", "--extra_llm_api_options", ctx_server_config_path, f"--tp_size={ctx_tp}", f"--pp_size={ctx_pp}" ] - gen_server_args = common_args + [ + gen_server_args = gen_args + [ "--port", "8002", "--extra_llm_api_options", gen_server_config_path, f"--tp_size={gen_tp}", f"--pp_size={gen_pp}" ] @@ -226,17 +238,21 @@ def generate_async(prompt: str, disaggregated_server.wait() -def run_parallel_test(model_name: str, model_path: str, ctx_pp: int, - ctx_tp: int, gen_pp: int, gen_tp: int, - test_set: LlmapiAccuracyTestHarness): +def run_parallel_test(model_name: str, + model_path: str, + ctx_pp: int, + ctx_tp: int, + gen_pp: int, + gen_tp: int, + test_sets: List[LlmapiAccuracyTestHarness], + ctx_model: str = None, + gen_model: str = None): if ctx_tp * ctx_pp + gen_tp * gen_pp > get_device_count(): pytest.fail( f"Not enough devices for ctx_pp={ctx_pp}+ctx_tp={ctx_tp} and gen_pp={gen_pp}+gen_tp={gen_tp} test" ) - kv_cache_config = { "free_gpu_memory_fraction": 0.5, - "enable_block_reuse": False } ctx_server_config = { "pipeline_parallel_size": ctx_pp, @@ -270,10 +286,14 @@ def run_parallel_test(model_name: str, model_path: str, ctx_pp: int, } } with launch_disaggregated_llm(disaggregated_server_config, - ctx_server_config, gen_server_config, - model_path) as llm: - task = test_set(model_name) - task.evaluate(llm) + ctx_server_config, + gen_server_config, + model_path, + ctx_model=ctx_model, + gen_model=gen_model) as llm: + for test_set in test_sets: + task = test_set(model_name) + task.evaluate(llm) @pytest.mark.timeout(3600) @@ -512,7 +532,7 @@ def test_tp_pp_symmetric(self, tp, pp, testset): if tp * pp * 2 > get_device_count(): pytest.skip(f"Not enough devices for tp={tp}*pp={pp} test") return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, pp, tp, pp, - tp, get_accuracy_task(testset)) + tp, [get_accuracy_task(testset)]) @parametrize_with_ids("ctx_pp", [2, 4]) @parametrize_with_ids("gen_tp", [1, 2]) @@ -522,7 +542,7 @@ def test_ctx_pp_gen_tp_asymmetric(self, ctx_pp, gen_tp, testset): pytest.skip( f"Not enough devices for ctx_pp={ctx_pp}*gen_tp={gen_tp} test") return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, ctx_pp, 1, 1, - gen_tp, get_accuracy_task(testset)) + gen_tp, [get_accuracy_task(testset)]) @pytest.mark.skip_less_device_memory(140000) @@ -776,3 +796,25 @@ def test_auto_dtype(self, overlap_scheduler): task.evaluate(llm) task = MMLU(self.MODEL_NAME) task.evaluate(llm) + + +@skip_pre_blackwell +@pytest.mark.timeout(3600) +class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): + FP4_MODEL = f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf" + FP8_MODEL = f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf" + + @pytest.mark.skip_less_device(4) + @pytest.mark.parametrize("ctx_pp,gen_tp", [(2, 2)], ids=["ctxpp2gentp2"]) + def test_mixed_ctx_gen_model(self, ctx_pp, gen_tp): + ctx_model = self.FP4_MODEL + gen_model = self.FP8_MODEL + return run_parallel_test("Qwen3/Qwen3-30B-A3B", + ctx_model, + ctx_pp=ctx_pp, + ctx_tp=1, + gen_pp=1, + gen_tp=gen_tp, + test_sets=[GSM8K, MMLU], + ctx_model=ctx_model, + gen_model=gen_model) diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index 1943a015cf6..8dc118d991c 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -25,6 +25,7 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False] accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True] +accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM] accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 2c04beb634a..fb3f518a686 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -70,6 +70,7 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] + - accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRTLLM] From a15af879ec62541bef8690e1abc4d0aa1135075c Mon Sep 17 00:00:00 2001 From: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com> Date: Tue, 19 Aug 2025 09:58:44 +0800 Subject: [PATCH 13/20] [None][refactor] Refactor Torch Compile Backend, MoeLoadBalancer and warmup Logic (#6615) Signed-off-by: yizhang-nv <187001205+yizhang-nv@users.noreply.github.com> Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com> --- tensorrt_llm/_torch/compilation/backend.py | 10 +- .../_torch/compilation/piecewise_optimizer.py | 23 +- tensorrt_llm/_torch/compilation/utils.py | 15 +- .../_torch/custom_ops/torch_custom_ops.py | 9 + .../modules/fused_moe/moe_load_balancer.py | 28 +- .../_torch/modules/multi_stream_utils.py | 33 ++- tensorrt_llm/_torch/pyexecutor/_util.py | 2 +- tensorrt_llm/_torch/pyexecutor/config.py | 1 + .../_torch/pyexecutor/cuda_graph_runner.py | 34 +-- .../_torch/pyexecutor/model_engine.py | 250 ++++++++++-------- tensorrt_llm/_torch/pyexecutor/py_executor.py | 16 +- tensorrt_llm/_torch/utils.py | 10 + tensorrt_llm/llmapi/llm_args.py | 19 ++ .../_torch/modules/test_moe_load_balancer.py | 6 +- 14 files changed, 271 insertions(+), 185 deletions(-) diff --git a/tensorrt_llm/_torch/compilation/backend.py b/tensorrt_llm/_torch/compilation/backend.py index f6e7ae64905..02e2ae8fe50 100644 --- a/tensorrt_llm/_torch/compilation/backend.py +++ b/tensorrt_llm/_torch/compilation/backend.py @@ -37,7 +37,7 @@ def __init__( enable_inductor=True, enable_userbuffers=False, enable_piecewise_cuda_graph: bool = False, - cuda_graph_batch_sizes: Optional[List[int]] = None, + capture_num_tokens: Optional[List[int]] = None, max_num_streams: int = 1, ) -> None: super().__init__() @@ -48,14 +48,12 @@ def __init__( self.custom_passes = Backend.get_custom_pass(enable_userbuffers) self.rank = tensorrt_llm.mpi_rank() self.enable_inductor = enable_inductor - self.cuda_graph_batch_sizes = (cuda_graph_batch_sizes - if cuda_graph_batch_sizes is not None - else []) + self.capture_num_tokens = capture_num_tokens or [] self.piecewise_cuda_graph = enable_piecewise_cuda_graph self.no_optimization = False # We only need to create aux streams. self.aux_streams = Backend.Streams( - [torch.cuda.Stream() for i in range(max_num_streams - 1)]) + [torch.cuda.Stream() for _ in range(max_num_streams - 1)]) self.events = Backend.Events() inductor_config.enable_auto_functionalized_v2 = False @@ -125,7 +123,7 @@ def optimize( example_inputs, self.enable_inductor, self.input_num_tokens, - self.cuda_graph_batch_sizes, + self.capture_num_tokens, self._graph_pool_handle, len(self.aux_streams) + 1, ) diff --git a/tensorrt_llm/_torch/compilation/piecewise_optimizer.py b/tensorrt_llm/_torch/compilation/piecewise_optimizer.py index f7624e6b161..c83644eed24 100644 --- a/tensorrt_llm/_torch/compilation/piecewise_optimizer.py +++ b/tensorrt_llm/_torch/compilation/piecewise_optimizer.py @@ -14,8 +14,7 @@ from ..utils import (get_model_extra_attrs, get_piecewise_cuda_graph_flag, make_weak_ref) from .multi_stream.auto_multi_stream import multi_stream_schedule -from .utils import (get_enable_piecewise_cuda_graph_capture_flag, - is_call_function) +from .utils import get_capture_piecewise_cuda_graph_flag, is_call_function class PiecewiseInterpreter(Interpreter): @@ -25,7 +24,7 @@ def __init__( module: GraphModule, enable_inductor: bool, compile_time_num_tokens: Union[int | torch.SymInt], - cuda_graph_batch_sizes: list[int], + capture_num_tokens: list[int], exclude_modules_id: list[int], graph_pool_handle: tuple[int, int], garbage_collect_values: bool = True, @@ -37,7 +36,7 @@ def __init__( self.fake_mode = detect_fake_mode() self.compile_time_num_tokens = compile_time_num_tokens - self.cuda_graph_batch_sizes = cuda_graph_batch_sizes + self.capture_num_tokens = capture_num_tokens self.exclude_modules = [f"submod_{i}" for i in exclude_modules_id] self.graph_pool_handle = graph_pool_handle self.enable_inductor = enable_inductor @@ -86,7 +85,7 @@ def call_module(self, target, args, kwargs): target, self.compile_time_num_tokens, runtime_num_tokens_idx, - self.cuda_graph_batch_sizes, + self.capture_num_tokens, self.graph_pool_handle, compile_fx(submod, args) if self.enable_inductor else submod, self.enable_inductor, @@ -120,7 +119,7 @@ def __init__( name: str, compile_time_num_tokens: Union[int | torch.SymInt], runtime_num_tokens_idx: tuple[int], - cuda_graph_batch_sizes: List[int], + capture_num_tokens: List[int], graph_pool_handle, default_callable: Callable, enable_inductor: bool, @@ -139,9 +138,9 @@ def __init__( self.entries: dict[int, Entry] = {} - for bs in cuda_graph_batch_sizes: - self.entries[bs] = Entry( - bs, + for num_tokens in capture_num_tokens: + self.entries[num_tokens] = Entry( + num_tokens, enable_inductor=self.enable_inductor, callable=default_callable, ) @@ -167,7 +166,7 @@ def __call__(self, *args): if entry.cuda_graph is None: - if not get_enable_piecewise_cuda_graph_capture_flag(): + if not get_capture_piecewise_cuda_graph_flag(): return entry.callable(*args) if entry.warmup_count < 3: @@ -228,7 +227,7 @@ def piecewise_optimizer( example_inputs: List[torch.Tensor], enable_inductor: bool, input_num_tokens: Union[int | torch.SymInt], - cuda_graph_batch_sizes: Sequence[int], + capture_num_tokens: Sequence[int], graph_pool_handle: tuple[int, int], max_num_streams: int = 1, ) -> tuple[GraphModule, int]: @@ -269,7 +268,7 @@ def piecewise_optimizer( gm, enable_inductor, input_num_tokens, - cuda_graph_batch_sizes, + capture_num_tokens, exclude_modules_id, graph_pool_handle, max_num_streams=max_num_streams, diff --git a/tensorrt_llm/_torch/compilation/utils.py b/tensorrt_llm/_torch/compilation/utils.py index fef3de2a067..0166c455d23 100644 --- a/tensorrt_llm/_torch/compilation/utils.py +++ b/tensorrt_llm/_torch/compilation/utils.py @@ -1,3 +1,4 @@ +import contextlib from typing import Callable, List, Union import torch @@ -33,16 +34,26 @@ def is_call_function(node: Node, target: Union[List[Callable], Callable]): _enable_piecewise_cuda_graph_capture = False -def set_enable_piecewise_cuda_graph_capture_flag(enable: bool): +def set_capture_piecewise_cuda_graph_flag(enable: bool): global _enable_piecewise_cuda_graph_capture _enable_piecewise_cuda_graph_capture = enable -def get_enable_piecewise_cuda_graph_capture_flag() -> bool: +def get_capture_piecewise_cuda_graph_flag() -> bool: global _enable_piecewise_cuda_graph_capture return _enable_piecewise_cuda_graph_capture +@contextlib.contextmanager +def capture_piecewise_cuda_graph(enable: bool): + prev_enable = get_capture_piecewise_cuda_graph_flag() + set_capture_piecewise_cuda_graph_flag(enable) + try: + yield + finally: + set_capture_piecewise_cuda_graph_flag(prev_enable) + + def inplace_info(): inplace_map = { torch.ops.trtllm.flashinfer_fused_add_rmsnorm.default: { diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py index a323bb4f553..bd946343b09 100644 --- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py @@ -8,6 +8,7 @@ from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec, OptimizationProfile, TunableRunner, TuningConfig) +from ..modules.multi_stream_utils import do_multi_stream from ..utils import (fp4_scale_infer_shape, get_last_power_of_2_num_tokens_buckets, last_positive_power_of_2) @@ -925,6 +926,8 @@ def get_stream(stream_id: int): @torch.library.custom_op("trtllm::set_stream", mutates_args=()) def set_stream(stream_id: int) -> None: + if not do_multi_stream(): + return stream = get_stream(stream_id) assert stream is not None torch.cuda.set_stream(stream) @@ -932,18 +935,24 @@ def set_stream(stream_id: int) -> None: @torch.library.custom_op("trtllm::record_event", mutates_args=()) def record_event(event_idx: int) -> None: + if not do_multi_stream(): + return event = get_event(event_idx) event.record() @torch.library.custom_op("trtllm::wait_event", mutates_args=()) def wait_event(event_idx: int) -> None: + if not do_multi_stream(): + return event = get_event(event_idx) event.wait() @torch.library.custom_op("trtllm::record_stream", mutates_args=()) def record_stream(tensor: torch.Tensor, stream_id: int) -> None: + if not do_multi_stream(): + return stream = get_stream(stream_id) assert stream is not None tensor.record_stream(stream) diff --git a/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py b/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py index 460625fb891..ff26c87687a 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py +++ b/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py @@ -9,12 +9,12 @@ import tensorrt_llm import tensorrt_llm.bindings.internal.runtime as _tbr -from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import is_graph_capturing from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping from ...distributed import AllReduce from ...utils import EventType +from ..multi_stream_utils import do_multi_stream def _tensor_to_weight(t: torch.Tensor) -> _tbr.MoeWeight: @@ -472,7 +472,7 @@ def start_wait_gpu_stage(self): assert self.func_called_count["start_wait_gpu_stage"] == 0 self.func_called_count["start_wait_gpu_stage"] += 1 if self.updates_enabled: - if is_graph_capturing(): + if do_multi_stream(): self.event_dict[EventType.Main].record() with torch.cuda.stream(self.aux_stream): self.event_dict[EventType.Main].wait() @@ -491,7 +491,7 @@ def done_wait_gpu_stage(self): assert self.func_called_count["done_wait_gpu_stage"] == 0 self.func_called_count["done_wait_gpu_stage"] += 1 if self.updates_enabled: - if is_graph_capturing(): + if do_multi_stream(): self.event_dict[EventType.MoeBalancer].wait() def start_set_cpu_stage(self): @@ -502,7 +502,7 @@ def start_set_cpu_stage(self): assert self.func_called_count["start_set_cpu_stage"] == 0 self.func_called_count["start_set_cpu_stage"] += 1 if self.updates_enabled: - if is_graph_capturing(): + if do_multi_stream(): self.event_dict[EventType.Main].record() with torch.cuda.stream(self.aux_stream): self.event_dict[EventType.Main].wait() @@ -522,7 +522,7 @@ def done_set_cpu_stage(self): self.func_called_count[name] = 0 self.statistic_flag_tensor = None if self.updates_enabled: - if is_graph_capturing(): + if do_multi_stream(): self.event_dict[EventType.MoeBalancer].wait() def update_local_statistic(self, local_raw_expert_ids: torch.Tensor, @@ -544,7 +544,7 @@ def update_local_statistic(self, local_raw_expert_ids: torch.Tensor, (self.expert_count, ), dtype=torch.int32, device=torch.device('cuda')) - if is_graph_capturing(): + if do_multi_stream(): self.event_dict[EventType.Main].record() with torch.cuda.stream(self.aux_stream): self.event_dict[EventType.Main].wait() @@ -569,7 +569,7 @@ def get_local_statistic_tensor(self) -> Optional[torch.Tensor]: assert self.func_called_count["update_local_statistic"] > 0 self.func_called_count["get_local_statistic_tensor"] += 1 if self.updates_enabled: - if is_graph_capturing(): + if do_multi_stream(): with torch.cuda.stream(self.aux_stream): self.event_dict[EventType.MoeBalancer].record() self.event_dict[EventType.MoeBalancer].wait() @@ -598,7 +598,7 @@ def _update_statistic(): self.single_layer_load_balancer_ptr) if self.updates_enabled: - if is_graph_capturing(): + if do_multi_stream(): self.event_dict[EventType.Main].record() with torch.cuda.stream(self.aux_stream): self.event_dict[EventType.Main].wait() @@ -636,7 +636,7 @@ def _update_statistic(): if self.updates_enabled: self.update_local_statistic(local_raw_expert_ids, is_first_stage, is_last_stage) - if is_graph_capturing(): + if do_multi_stream(): with torch.cuda.stream(self.aux_stream): _update_statistic() else: @@ -660,7 +660,7 @@ def update_statistic_with_global_ids(self, assert self.func_called_count["update_statistic_with_local_ids"] == 0 self.func_called_count["update_statistic_with_global_ids"] += 1 if self.updates_enabled: - if is_graph_capturing(): + if do_multi_stream(): self.event_dict[EventType.Main].record() with torch.cuda.stream(self.aux_stream): self.event_dict[EventType.Main].wait() @@ -851,8 +851,8 @@ def set_warm_up_iter_count(self, iter_count: int): """ self.load_balancer_impl.set_warm_up_iter_count(iter_count) - def set_next_iter_info(self, enable_statistic: Optional[bool], - enable_update_weights: Optional[bool]): + def set_iter_info(self, enable_statistic: Optional[bool], + enable_update_weights: Optional[bool]): if enable_statistic is not None: self.enable_statistic = enable_statistic if enable_update_weights is not None: @@ -998,8 +998,8 @@ def __enter__(self): """ if self.moe_load_balancer is not None and not self.moe_load_balancer.is_static_routing( ): - self.moe_load_balancer.set_next_iter_info(self.enable_statistic, - self.enable_updates) + self.moe_load_balancer.set_iter_info(self.enable_statistic, + self.enable_updates) self.moe_load_balancer.start_iter() return self diff --git a/tensorrt_llm/_torch/modules/multi_stream_utils.py b/tensorrt_llm/_torch/modules/multi_stream_utils.py index e91b7eac245..c7b58c0896b 100644 --- a/tensorrt_llm/_torch/modules/multi_stream_utils.py +++ b/tensorrt_llm/_torch/modules/multi_stream_utils.py @@ -1,8 +1,35 @@ +import threading +from contextlib import contextmanager from typing import Any, Callable, Optional import torch -from ..pyexecutor.cuda_graph_runner import is_graph_capturing + +class do_multi_stream_local(threading.local): + + def __init__(self): + self.do_multi_stream = False + + +_local = do_multi_stream_local() + + +def set_do_multi_stream(enable: bool): + _local.do_multi_stream = enable + + +def do_multi_stream() -> bool: + return _local.do_multi_stream + + +@contextmanager +def with_multi_stream(enable: bool): + prev_do_multi_stream = _local.do_multi_stream + set_do_multi_stream(enable) + try: + yield + finally: + set_do_multi_stream(prev_do_multi_stream) def maybe_execute_in_parallel( @@ -30,9 +57,9 @@ def maybe_execute_in_parallel( tuple[Any, Any]: the return values of fn0() and fn1() """ - do_multi_stream = is_graph_capturing() and aux_stream is not None + multi_stream = do_multi_stream() and aux_stream is not None - if do_multi_stream: + if multi_stream: event0.record() result0 = fn0() diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py index ca224f0b2cd..fed6a715374 100644 --- a/tensorrt_llm/_torch/pyexecutor/_util.py +++ b/tensorrt_llm/_torch/pyexecutor/_util.py @@ -242,8 +242,8 @@ def estimate_max_tokens(self, py_executor: PyExecutor) -> None: torch_used_bytes = torch.cuda.memory_stats( )["allocated_bytes.all.current"] finally: - py_executor.shutdown() py_executor.is_warmup = False + py_executor.shutdown() py_executor.enable_iter_perf_stats = origin_iter_stats py_executor.set_gather_responses(False) diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py index 631f974db26..c656aac8c60 100644 --- a/tensorrt_llm/_torch/pyexecutor/config.py +++ b/tensorrt_llm/_torch/pyexecutor/config.py @@ -79,6 +79,7 @@ class PyTorchConfig: torch_compile_fullgraph: bool = True torch_compile_inductor_enabled: bool = False torch_compile_piecewise_cuda_graph: bool = False + torch_compile_piecewise_cuda_graph_num_tokens: Optional[List[int]] = None # When torch compile is enabled, userbuffers is enabled by default torch_compile_enable_userbuffers: bool = True torch_compile_max_num_streams: int = 1 diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py index 50306d66a66..df674a94968 100644 --- a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py +++ b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py @@ -1,28 +1,11 @@ -import threading from typing import Any, Callable, Dict, Optional, Tuple import torch from ..attention_backend.interface import AttentionMetadata +from ..modules.multi_stream_utils import with_multi_stream from ..speculative.interface import SpecMetadata -from ..utils import make_weak_ref, set_piecewise_cuda_graph_flag - - -class graph_capturing_local(threading.local): - - def __init__(self): - self.is_graph_capturing = False - - -_local = graph_capturing_local() - - -def set_graph_capturing(enable: bool): - _local.is_graph_capturing = enable - - -def is_graph_capturing() -> bool: - return _local.is_graph_capturing +from ..utils import make_weak_ref, piecewise_cuda_graph class DecodingCUDAGraphRunner: @@ -97,14 +80,11 @@ def capture( # internal states according to the docs: # https://pytorch.org/docs/stable/notes/cuda.html#cuda-graph-semantics # This also lets us initialize states in the attn_metadata. - set_graph_capturing(True) - set_piecewise_cuda_graph_flag(False) - for _ in range(2): - forward_fn(inputs) - with torch.cuda.graph(self._graph, pool=pool): - output = forward_fn(inputs) - set_graph_capturing(False) - set_piecewise_cuda_graph_flag(True) + with with_multi_stream(True), piecewise_cuda_graph(False): + for _ in range(2): + forward_fn(inputs) + with torch.cuda.graph(self._graph, pool=pool): + output = forward_fn(inputs) # Mark weak ref here. The output tensor should be freed properly. self._output = make_weak_ref(output) return self._graph.pool() diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 22a53c4666f..a34f03edb55 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -40,7 +40,7 @@ from ..attention_backend.vanilla import VanillaAttentionMetadata from ..autotuner import AutoTuner, autotune from ..compilation.backend import Backend -from ..compilation.utils import set_enable_piecewise_cuda_graph_capture_flag +from ..compilation.utils import capture_piecewise_cuda_graph from ..distributed import MPIDist from ..distributed.communicator import init_pp_comm from ..expert_statistic import ExpertStatistic @@ -293,8 +293,6 @@ def __init__( self.enable_spec_decode = self.is_spec_decode self.is_draft_model = is_draft_model - self.in_warmup = False - self.attn_runtime_features = attn_runtime_features or AttentionRuntimeFeatures( ) @@ -335,6 +333,15 @@ def __init__( pytorch_backend_config.torch_compile_piecewise_cuda_graph and not self.enable_attention_dp) + piecewise_cuda_graph_num_tokens = ( + pytorch_backend_config.torch_compile_piecewise_cuda_graph_num_tokens + or pytorch_backend_config.cuda_graph_batch_sizes or []) + + self._piecewise_cuda_graph_num_tokens = [ + i for i in piecewise_cuda_graph_num_tokens + if i <= self.max_num_tokens + ] + try: use_ub_for_nccl = ( pytorch_backend_config.allreduce_strategy == "NCCL_SYMMETRIC" @@ -349,8 +356,7 @@ def __init__( enable_userbuffers=use_ub, enable_piecewise_cuda_graph=self. _torch_compile_piecewise_cuda_graph, - cuda_graph_batch_sizes=pytorch_backend_config. - cuda_graph_batch_sizes, + capture_num_tokens=self._piecewise_cuda_graph_num_tokens, max_num_streams=pytorch_backend_config. torch_compile_max_num_streams) if isinstance(self.model, DecoderModelForCausalLM): @@ -373,6 +379,8 @@ def __init__( traceback.print_exception(Exception, e, e.__traceback__) raise e + self.is_warmup = False + self.attn_backend = get_attention_backend(attn_backend) if self.is_spec_decode: @@ -478,17 +486,44 @@ def use_mrope(self): logger.debug(f"Detected use_mrope: {use_mrope}") return use_mrope + @property + def is_warmup(self): + return getattr(self, "_is_warmup", False) + + @is_warmup.setter + def is_warmup(self, value: bool): + self._is_warmup = value + + self.moe_load_balancer_iter_info = (not value, not value) + + @property + def moe_load_balancer_iter_info(self): + moe_load_balancer: MoeLoadBalancer = getattr(self, 'moe_load_balancer', + None) + if moe_load_balancer is not None: + return moe_load_balancer.enable_statistic, moe_load_balancer.enable_update_weights + return False, False + + @moe_load_balancer_iter_info.setter + def moe_load_balancer_iter_info(self, value: Tuple[bool, bool]): + moe_load_balancer: MoeLoadBalancer = getattr(self, 'moe_load_balancer', + None) + if moe_load_balancer is not None: + moe_load_balancer.set_iter_info(enable_statistic=value[0], + enable_update_weights=value[1]) + @property def use_beam_search(self): return self.max_beam_width > 1 @contextmanager def set_warmup_flag(self): - self.in_warmup = True + prev_is_warmup = self.is_warmup + self.is_warmup = True try: yield finally: - self.in_warmup = False + self.is_warmup = prev_is_warmup @staticmethod def with_warmup_flag(method): @@ -669,120 +704,110 @@ def release_batch(result: ScheduledRequests | None): if cp_type == CpType.STAR: return - with contextlib.ExitStack() as stack: - if self._torch_compile_enabled: - - def disable_optimization(backend: Backend): - # Disable torch.compile optimization and fallback to eager execution - backend.bypass_optimization() - # Disable piecewise CUDA graph capture since the capture run will produce wrong results - set_enable_piecewise_cuda_graph_capture_flag(False) - - stack.callback(disable_optimization, - self._torch_compile_backend) - - self._torch_compile_backend.enable_optimization() - - # Disable cuda graph capture here so that we can properly capture it later - with self.no_cuda_graph(): - available_tokens = kv_cache_manager.get_num_available_tokens( - self.runtime_draft_len) - warmup_batch_size = [1, self.batch_size // 2] - if self.batch_size < 2: - warmup_batch_size = [1] - for bs in warmup_batch_size: - for num_tokens_per_request in [ - 1, - min(self.max_num_tokens // max(bs, 1), - min(available_tokens, self.max_seq_len - 1)) - ]: - with release_batch( - get_torch_compile_warmup_request( - bs, num_tokens_per_request)) as batch: - if batch is None: - # No KV cache space! - continue - logger.info( - f"Run warmup for batch size={bs}, pure {'context' if num_tokens_per_request > 1 else 'generation'} phase" - ) - self.forward(batch, - new_tensors_device=None, - resource_manager=resource_manager) - torch.cuda.synchronize() - - if self.pytorch_backend_config.enable_autotuner: - with self.no_cuda_graph(), autotune(): - result = get_autotune_warmup_request() - with release_batch(result) as batch: - if batch is None: - # No KV cache space! - pass - else: + if self._torch_compile_enabled: + + # Disable cuda graph capture here so that we can properly capture it later + with self.no_cuda_graph(): + available_tokens = kv_cache_manager.get_num_available_tokens( + self.runtime_draft_len) + warmup_batch_size = [1, self.batch_size // 2] + if self.batch_size < 2: + warmup_batch_size = [1] + for bs in warmup_batch_size: + for num_tokens_per_request in [ + 1, + min(self.max_num_tokens // max(bs, 1), + min(available_tokens, self.max_seq_len - 1)) + ]: + with release_batch( + get_torch_compile_warmup_request( + bs, num_tokens_per_request)) as batch: + if batch is None: + # No KV cache space! + continue + logger.info( + f"Run warmup for batch size={bs}, pure {'context' if num_tokens_per_request > 1 else 'generation'} phase" + ) self.forward(batch, new_tensors_device=None, resource_manager=resource_manager) torch.cuda.synchronize() - logger.info( - f"[Autotuner] Cache size after warmup is {len(AutoTuner.get().profiling_cache)}" - ) - - AutoTuner.get().print_profiling_cache() - - if not (self._run_cuda_graphs - or self._torch_compile_piecewise_cuda_graph): - return - - logger.info( - f"Creating CUDA graph instances for {len(self._cuda_graph_batch_sizes)} batch sizes." - ) - # Reverse the order of the cuda graph batch sizes to make smaller batch size graph could reuse larger batch size graph memory - cuda_graph_batch_sizes = sorted(self._cuda_graph_batch_sizes, - reverse=True) - # Create CUDA graphs for different draft lengths - draft_lengths = [self.max_draft_len] - # For non-draft model, we also capture the CUDA graph instance for draft length 0, - # so that when we disable spec decode at runtime, we can still run the captured graph. - # Note that for one engine mode, we are not able to turn off spec decode at runtime. - if (not self.is_draft_model and self.max_draft_len > 0 - and not self.spec_config.spec_dec_mode.use_one_engine() - # Assume that speculation is always on if the user didn't give us a max_concurrency - # value. This will save on memory. - and self.spec_config.max_concurrency is not None): - draft_lengths.append(0) - - for bs in cuda_graph_batch_sizes: - if bs > self.batch_size: - # skip batch size larger than self.batch_size - continue - - for draft_len in draft_lengths: - with release_batch( - get_cuda_graph_warmup_request(bs, - draft_len)) as batch: - if batch is None: - # No KV cache space! - return - logger.info( - f"Run generation only CUDA graph warmup for batch size={bs}, draft_len={draft_len}" - ) - self.enable_spec_decode = draft_len > 0 or self.is_draft_model + if self.pytorch_backend_config.enable_autotuner: + with self.no_cuda_graph(), autotune(): + result = get_autotune_warmup_request() + with release_batch(result) as batch: + if batch is None: + # No KV cache space! + pass + else: self.forward(batch, new_tensors_device=None, resource_manager=resource_manager) torch.cuda.synchronize() - if self._torch_compile_piecewise_cuda_graph and self._torch_compile_enabled: - for seq_lens in cuda_graph_batch_sizes: - set_enable_piecewise_cuda_graph_capture_flag(True) + logger.info( + f"[Autotuner] Cache size after warmup is {len(AutoTuner.get().profiling_cache)}" + ) + + AutoTuner.get().print_profiling_cache() + + if not (self._run_cuda_graphs + or self._torch_compile_piecewise_cuda_graph): + return + + logger.info( + f"Creating CUDA graph instances for {len(self._cuda_graph_batch_sizes)} batch sizes." + ) + # Reverse the order of the cuda graph batch sizes to make smaller batch size graph could reuse larger batch size graph memory + cuda_graph_batch_sizes = sorted(self._cuda_graph_batch_sizes, + reverse=True) + # Create CUDA graphs for different draft lengths + draft_lengths = [self.max_draft_len] + # For non-draft model, we also capture the CUDA graph instance for draft length 0, + # so that when we disable spec decode at runtime, we can still run the captured graph. + # Note that for one engine mode, we are not able to turn off spec decode at runtime. + if (not self.is_draft_model and self.max_draft_len > 0 + and not self.spec_config.spec_dec_mode.use_one_engine() + # Assume that speculation is always on if the user didn't give us a max_concurrency + # value. This will save on memory. + and self.spec_config.max_concurrency is not None): + draft_lengths.append(0) + + for bs in cuda_graph_batch_sizes: + if bs > self.batch_size: + # skip batch size larger than self.batch_size + continue + + for draft_len in draft_lengths: + with release_batch(get_cuda_graph_warmup_request( + bs, draft_len)) as batch: + if batch is None: + # No KV cache space! + return + logger.info( + f"Run generation only CUDA graph warmup for batch size={bs}, draft_len={draft_len}" + ) + self.enable_spec_decode = draft_len > 0 or self.is_draft_model + self.forward(batch, + new_tensors_device=None, + resource_manager=resource_manager) + torch.cuda.synchronize() + + if self._torch_compile_piecewise_cuda_graph and self._torch_compile_enabled: + piecewise_cuda_graph_num_tokens = sorted( + self._piecewise_cuda_graph_num_tokens, reverse=True) + + with capture_piecewise_cuda_graph(True): + for num_tokens in piecewise_cuda_graph_num_tokens: with self.no_cuda_graph(): with release_batch( get_torch_compile_warmup_request( - 1, seq_lens)) as batch: + 1, num_tokens)) as batch: logger.info( - f"Run piecewise CUDA graph warmup for seq_lens={seq_lens}" + f"Run piecewise CUDA graph warmup for num tokens={num_tokens}" ) - # self.model.mtp_worker.stored_input_ids = [] + for _ in range(3): self.forward(batch, new_tensors_device=None, @@ -793,7 +818,6 @@ def disable_optimization(backend: Backend): torch.cuda.synchronize() gc.collect() torch.cuda.empty_cache() - set_enable_piecewise_cuda_graph_capture_flag(False) # Set the value back to the original value self.enable_spec_decode = self.is_spec_decode @@ -1541,7 +1565,7 @@ def previous_seq_slots_device(): # Cache indirection is only used for beam search on generation requests if self.use_beam_search and num_generation_requests > 0: # CUDA Graph needs to set beam width during warmup (where the graph is captured), to ensure that cache indirection buffer is correctly picked up by the CUDA graph - is_cuda_graph_during_warmup = self.in_warmup and attn_metadata.is_cuda_graph + is_cuda_graph_during_warmup = self.is_warmup and attn_metadata.is_cuda_graph if cache_indirection_buffer is not None: #Copy cache indirection to local buffer with offsets changing: seq_slots[i] -> i self.cache_indirection_attention[:num_generation_requests].copy_( @@ -2151,14 +2175,8 @@ def forward( spec_resource_manager = None spec_metadata = None - moe_load_balancer = None - if hasattr(self, 'moe_load_balancer'): - moe_load_balancer = getattr(self, 'moe_load_balancer') - if not self.in_warmup: - moe_enable_statistic = True - moe_enable_update = True - moe_load_balancer.set_next_iter_info(moe_enable_statistic, - moe_enable_update) + moe_load_balancer: MoeLoadBalancer = getattr(self, 'moe_load_balancer', + None) if kv_cache_manager is None: inputs, gather_ids = self._prepare_tp_inputs_no_cache( diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 4573528e314..8dbbe39abb6 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -161,7 +161,6 @@ def __init__(self, self.profile_start_iters, self.profile_stop_iters = _load_iteration_indexes( PROFILE_START_STOP_ENV_VAR_NAME) self.gc_nvtx_watcher_handle = _gc_nvtx_watcher() - self.is_warmup = False # During warmup, we don't enable the profiler # related modules self.resource_manager = resource_manager @@ -220,9 +219,12 @@ def __init__(self, self.inflight_req_ids = ReqIdsSet() + # During warmup, we don't enable the profiler + self.is_warmup = True self.model_engine.warmup(self.resource_manager) if self.draft_model_engine is not None: self.draft_model_engine.warmup(self.resource_manager) + self.is_warmup = False self.is_shutdown = False self.max_batch_size = max_batch_size @@ -280,6 +282,18 @@ def _event_loop_wrapper(self): finally: self._executor_loop_cleanup() + @property + def is_warmup(self) -> bool: + return getattr(self, "_is_warmup", False) + + @is_warmup.setter + def is_warmup(self, value: bool): + self._is_warmup = value + # Set warmup flag in model engine to trigger torch compile and avoid moe load balancer statistics update + self.model_engine.is_warmup = value + if self.draft_model_engine is not None: + self.draft_model_engine.is_warmup = value + def start_worker(self): with self.worker_lock: if self.worker_started == False: diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py index 932aa7e7e67..4068ad44a60 100644 --- a/tensorrt_llm/_torch/utils.py +++ b/tensorrt_llm/_torch/utils.py @@ -265,3 +265,13 @@ def set_piecewise_cuda_graph_flag(enable: bool): def get_piecewise_cuda_graph_flag() -> bool: global _enable_piecewise_cuda_graph return _enable_piecewise_cuda_graph + + +@contextlib.contextmanager +def piecewise_cuda_graph(enable: bool): + prev_enable = get_piecewise_cuda_graph_flag() + set_piecewise_cuda_graph_flag(enable) + try: + yield + finally: + set_piecewise_cuda_graph_flag(prev_enable) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index abc41b00356..948c4b16883 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1990,6 +1990,21 @@ class TorchCompileConfig(StrictBaseModel): default=False, description="Enable piecewise CUDA graph in torch.compile.") + capture_num_tokens: Optional[List[int]] = Field( + default=None, + description= + "List of num of tokens to capture the piecewise CUDA graph for. If not provided, the number of tokens will be the same as cuda_graph_config.batch_sizes." + ) + + @field_validator('capture_num_tokens') + @classmethod + def validate_capture_num_tokens(cls, v): + if v is None: + return v + if any(t <= 0 for t in v): + raise ValueError("capture_num_tokens must contain positive ints.") + return sorted(set(v), reverse=True) + enable_userbuffers: bool = Field( default=True, description= @@ -2368,6 +2383,10 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig": enable_piecewise_cuda_graph if self.torch_compile_config is not None else TorchCompileConfig. model_fields['enable_piecewise_cuda_graph'].default, + torch_compile_piecewise_cuda_graph_num_tokens=self. + torch_compile_config.capture_num_tokens + if self.torch_compile_config is not None else + TorchCompileConfig.model_fields['capture_num_tokens'].default, torch_compile_enable_userbuffers=self.torch_compile_config. enable_userbuffers if self.torch_compile_config is not None else TorchCompileConfig.model_fields['enable_userbuffers'].default, diff --git a/tests/unittest/_torch/modules/test_moe_load_balancer.py b/tests/unittest/_torch/modules/test_moe_load_balancer.py index 66edbd6d17e..969fb04d030 100644 --- a/tests/unittest/_torch/modules/test_moe_load_balancer.py +++ b/tests/unittest/_torch/modules/test_moe_load_balancer.py @@ -269,7 +269,7 @@ def test_moe_load_balancer_lifecycle_methods(self, mock_load_balancer_impl): mock_load_balancer_impl.return_value.set_warm_up_iter_count.assert_called_once_with( 10) - balancer.set_next_iter_info(True, True) + balancer.set_iter_info(True, True) with MoeLoadBalancerIterContext(balancer): mock_load_balancer_impl.return_value.start_iter.assert_called_once_with( @@ -308,7 +308,7 @@ def test_real_statistic_kernel(self): balancer.finalize_model() # enable statistic, disable weight update - balancer.set_next_iter_info(True, False) + balancer.set_iter_info(True, False) # Create sample token data - each token selects 2 experts # 4 tokens, each selecting 2 experts @@ -373,7 +373,7 @@ def test_real_routing_kernel(self): balancer.finalize_model() # enable statistic, disable weight update - balancer.set_next_iter_info(True, False) + balancer.set_iter_info(True, False) # Create sample token data - tokens selecting different experts token_selected_experts = torch.tensor( From 06911c0173f98d372d3ec282a6f773b091f08ed2 Mon Sep 17 00:00:00 2001 From: Venky <23023424+venkywonka@users.noreply.github.com> Date: Mon, 18 Aug 2025 19:11:36 -0700 Subject: [PATCH 14/20] [None] [infra] stricter coderabbit pr title generation instructions (#6918) Signed-off-by: Venky Ganesh <23023424+venkywonka@users.noreply.github.com> --- .coderabbit.yaml | 2 +- .github/pull_request_template.md | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.coderabbit.yaml b/.coderabbit.yaml index a7d05d70d51..1644fad2728 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -20,7 +20,7 @@ language: "en-US" reviews: profile: chill auto_title_placeholder: '@coderabbitai title' - auto_title_instructions: 'Should follow the format: "[fix/feat/doc/infra/...] \". Keep it concise.' + auto_title_instructions: 'Format: "[] ". Category must be one of: fix, feat, doc, infra, style, refactor, perf, test, chore, revert. Enclose the category in square brackets. Title should be concise (<= 60 chars). Example: "[feat] Add logit_bias support".' commit_status: false collapse_walkthrough: true assess_linked_issues: true diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 45f9ebf7f12..4665a9682a3 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -18,6 +18,14 @@ Examples: - [https://nvbugs/1234567][fix] Fix some bugs - [#1234][doc] Update documentation - [None][chore] Minor clean-up + +Alternative (faster) way using CodeRabbit AI: + +**[JIRA ticket/NVBugs ID/GitHub issue/None] @coderabbitai title** + +NOTE: "@coderabbitai title" will be replaced by the title generated by CodeRabbit AI, that includes the "[type]" and title. +For more info, see /.coderabbit.yaml. + --> ## Description From 2bb90ba002a6c794b7b5b5ed3c0cc4b09a61dfb4 Mon Sep 17 00:00:00 2001 From: Zhenhuan Chen <chenzhh3671@gmail.com> Date: Tue, 19 Aug 2025 10:18:04 +0800 Subject: [PATCH 15/20] [TRTLLM-6960][fix] enable scaled_mm tests (#6936) Signed-off-by: Zhenhuan Chen <chenzhh3671@gmail.com> --- tests/unittest/_torch/thop/test_scaled_mm.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/unittest/_torch/thop/test_scaled_mm.py b/tests/unittest/_torch/thop/test_scaled_mm.py index 31149de7310..40dfa4e0cb4 100644 --- a/tests/unittest/_torch/thop/test_scaled_mm.py +++ b/tests/unittest/_torch/thop/test_scaled_mm.py @@ -38,11 +38,6 @@ [torch.float16, torch.float32, torch.bfloat16], ) def test_fp8_scaled_mm(output_dtype, m, k_n): - if getSMVersion() == 90: - pytest.skip( - "Skip test for sm90 because it's too flaky. https://nvbugspro.nvidia.com/bug/5441734" - ) - k, n = k_n torch.random.manual_seed(0) shape_x = (m, k) @@ -76,7 +71,7 @@ def test_fp8_scaled_mm(output_dtype, m, k_n): os.environ["CUBLASLT_WORKSPACE_SIZE"] = old_env np.testing.assert_allclose(ref.float().cpu(), output.float().cpu(), - atol=1, + atol=0.01, rtol=0.01) if getSMVersion() == 90: From c49f0f1bc81f989f129f40e7fcde2938526c6bcb Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Tue, 19 Aug 2025 11:52:24 +0800 Subject: [PATCH 16/20] add llmapi trt flow test case with cuda graph and generate logits Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> --- tests/integration/defs/accuracy/test_llm_api.py | 14 +++++++++++++- .../test_lists/qa/llm_function_full.txt | 1 + tests/integration/test_lists/waives.txt | 2 ++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py index f34bcdb5be4..8f4851472a0 100644 --- a/tests/integration/defs/accuracy/test_llm_api.py +++ b/tests/integration/defs/accuracy/test_llm_api.py @@ -15,7 +15,8 @@ import pytest from tensorrt_llm._tensorrt_engine import LLM -from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig +from tensorrt_llm.llmapi import (EagleDecodingConfig, + ExtendedRuntimePerfKnobConfig, KvCacheConfig) from tensorrt_llm.models.modeling_utils import QuantConfig from tensorrt_llm.quantization import QuantAlgo @@ -76,6 +77,17 @@ def test_guided_decoding_4gpus(self, backend: str): task = JsonModeEval(self.MODEL_NAME) task.evaluate(llm) + def test_gather_generation_logits_cuda_graph(self): + extended_runtime_perf_knob_config = ExtendedRuntimePerfKnobConfig( + cuda_graph_mode=True, cuda_graph_cache_size=1) + llm = LLM( + self.MODEL_PATH, + gather_generation_logits=True, + extended_runtime_perf_knob_config=extended_runtime_perf_knob_config) + with llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + class TestLlama3_2_1B(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.2-1B" diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index 1859762fc17..29b915c214b 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -420,6 +420,7 @@ accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp4 accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] +accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 49c9a6d0107..7c1a5bb8693 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -315,3 +315,5 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5459817) llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5461796) disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5459811) +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437384) +accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5365525) From e395958562770e26e6d2f1c9939e59206047136b Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Mon, 21 Jul 2025 16:20:43 +0800 Subject: [PATCH 17/20] add llmapi logprobs test with trt flow Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> --- .../defs/accuracy/references/cnn_dailymail.yaml | 2 ++ tests/integration/defs/accuracy/test_llm_api.py | 12 +++++++++++- .../integration/test_lists/qa/llm_function_full.txt | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml index 67781cd8d15..fa0d1b32f54 100644 --- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml +++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml @@ -124,6 +124,8 @@ meta-llama/Llama-3.1-8B-Instruct: - accuracy: 33.640 - spec_dec_algo: Eagle accuracy: 33.640 + - extra_acc_spec: logprobs=2 + accuracy: 30.522 - quant_algo: FP8 accuracy: 33.841 - quant_algo: FP8 diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py index 8f4851472a0..24974238b99 100644 --- a/tests/integration/defs/accuracy/test_llm_api.py +++ b/tests/integration/defs/accuracy/test_llm_api.py @@ -16,7 +16,8 @@ from tensorrt_llm._tensorrt_engine import LLM from tensorrt_llm.llmapi import (EagleDecodingConfig, - ExtendedRuntimePerfKnobConfig, KvCacheConfig) + ExtendedRuntimePerfKnobConfig, KvCacheConfig, + SamplingParams) from tensorrt_llm.models.modeling_utils import QuantConfig from tensorrt_llm.quantization import QuantAlgo @@ -88,6 +89,15 @@ def test_gather_generation_logits_cuda_graph(self): task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) + def test_logprobs(self): + sampling_config = SamplingParams(logprobs=2) + llm = LLM(self.MODEL_PATH, gather_generation_logits=True) + with llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm, + sampling_params=sampling_config, + extra_acc_spec="logprobs=2") + class TestLlama3_2_1B(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.2-1B" diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index 29b915c214b..2aab7e288aa 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -421,6 +421,7 @@ accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph +accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise From 117bd93214287cb600d31de4a6ce5e4188d23f84 Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Tue, 22 Jul 2025 14:21:00 +0800 Subject: [PATCH 18/20] add llmapi trt flow phi-4-mini-instruct acc test Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> --- .../accuracy/references/cnn_dailymail.yaml | 4 ++++ .../defs/accuracy/references/mmlu.yaml | 2 ++ .../integration/defs/accuracy/test_llm_api.py | 20 +++++++++++++++++++ .../test_lists/qa/llm_function_full.txt | 2 ++ 4 files changed, 28 insertions(+) diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml index fa0d1b32f54..2c2094b6d4a 100644 --- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml +++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml @@ -232,6 +232,10 @@ mistralai/Mixtral-8x22B-v0.1: accuracy: 25.519 - quant_algo: W8A16 accuracy: 27.427 +microsoft/Phi-4-mini-instruct: + - accuracy: 32.958 + - quant_algo: FP8 + accuracy: 32.823 google/gemma-2b: - accuracy: 23.194 - quant_algo: W8A16 diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml index 7f2bb55e6f7..b465940f2bb 100644 --- a/tests/integration/defs/accuracy/references/mmlu.yaml +++ b/tests/integration/defs/accuracy/references/mmlu.yaml @@ -229,6 +229,8 @@ nvidia/Nemotron-H-56B-Base-8K: accuracy: 83.82 microsoft/Phi-4-mini-instruct: - accuracy: 68.98 + - quant_algo: FP8 + accuracy: 68.30 # Created a dummy accuracy to track tp_size=2 for phi4-mini model. # TODO: update once https://nvbugs/5393849 is fixed. microsoft/Phi-4-mini-instruct-tp2: diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py index 24974238b99..fb535b47396 100644 --- a/tests/integration/defs/accuracy/test_llm_api.py +++ b/tests/integration/defs/accuracy/test_llm_api.py @@ -266,6 +266,26 @@ def test_awq_tp2(self): task.evaluate(llm) +class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness): + MODEL_NAME = "microsoft/Phi-4-mini-instruct" + MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct" + + def test_auto_dtype(self): + with LLM(self.MODEL_PATH) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + def test_fp8(self): + quant_config = QuantConfig(QuantAlgo.FP8) + with LLM(self.MODEL_PATH, quant_config=quant_config) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "Qwen/Qwen2-7B-Instruct" MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct" diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index 2aab7e288aa..cf3eec8eeb3 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -422,6 +422,8 @@ accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs +accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_auto_dtype +accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise From ead824e59c08a5a9e87be9b3a8190d8b0a04bbb3 Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Tue, 22 Jul 2025 16:00:29 +0800 Subject: [PATCH 19/20] add nemo 12b base test cases Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> --- .../accuracy/references/cnn_dailymail.yaml | 9 ++-- .../defs/accuracy/references/mmlu.yaml | 4 ++ .../integration/defs/accuracy/test_llm_api.py | 43 ++++++++++++++++--- .../defs/accuracy/test_llm_api_pytorch.py | 30 +++++++++++++ .../test_lists/qa/llm_function_full.txt | 6 ++- 5 files changed, 81 insertions(+), 11 deletions(-) diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml index 2c2094b6d4a..93e025b2d8b 100644 --- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml +++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml @@ -45,6 +45,8 @@ microsoft/Phi-3.5-mini-instruct: - accuracy: 31.354 microsoft/Phi-4-mini-instruct: - accuracy: 32.921 + - quant_algo: FP8 + accuracy: 32.823 state-spaces/mamba-130m-hf: - accuracy: 19.470 lmsys/vicuna-7b-v1.3: @@ -201,7 +203,8 @@ mistralai/Mistral-7B-Instruct-v0.3: accuracy: 31.201 mistralai/Mistral-Small-3.1-24B-Instruct-2503: - accuracy: 29.20 -mistralai/Mistral-Nemo-Base-2407: +mistralai/Mistral-Nemo-12b-Base: + - accuracy: 28.906 - quant_algo: FP8 kv_cache_quant_algo: FP8 accuracy: 24.0 @@ -232,10 +235,6 @@ mistralai/Mixtral-8x22B-v0.1: accuracy: 25.519 - quant_algo: W8A16 accuracy: 27.427 -microsoft/Phi-4-mini-instruct: - - accuracy: 32.958 - - quant_algo: FP8 - accuracy: 32.823 google/gemma-2b: - accuracy: 23.194 - quant_algo: W8A16 diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml index b465940f2bb..b6c38930a15 100644 --- a/tests/integration/defs/accuracy/references/mmlu.yaml +++ b/tests/integration/defs/accuracy/references/mmlu.yaml @@ -266,3 +266,7 @@ GPT-OSS/MXFP4: accuracy: 75.50 - quant_algo: W4A8_MXFP4_FP8 accuracy: 75.50 +mistralai/Mistral-Nemo-12b-Base: + - accuracy: 69.66 + - quant_algo: FP8 + accuracy: 69.66 diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py index fb535b47396..602ed91dbf6 100644 --- a/tests/integration/defs/accuracy/test_llm_api.py +++ b/tests/integration/defs/accuracy/test_llm_api.py @@ -79,6 +79,7 @@ def test_guided_decoding_4gpus(self, backend: str): task.evaluate(llm) def test_gather_generation_logits_cuda_graph(self): + """RCCA: https://nvbugs/5365525""" extended_runtime_perf_knob_config = ExtendedRuntimePerfKnobConfig( cuda_graph_mode=True, cuda_graph_cache_size=1) llm = LLM( @@ -199,18 +200,49 @@ def test_quant_tp4(self, quant): task.evaluate(llm) -class TestMistral_Nemo_12B_Base(LlmapiAccuracyTestHarness): - MODEL_NAME = "mistralai/Mistral-Nemo-Base-2407" +class TestMistralNemo12B(LlmapiAccuracyTestHarness): + MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base" MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407" + @pytest.mark.skip_less_device_memory(80000) + def test_auto_dtype(self): + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) + + with LLM(self.MODEL_PATH, + kv_cache_config=kv_cache_config, + max_batch_size=8) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + def test_auto_dtype_tp2(self): + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) + + with LLM(self.MODEL_PATH, + kv_cache_config=kv_cache_config, + tensor_parallel_size=2, + max_batch_size=8) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + @pytest.mark.skip_less_device_memory(80000) @skip_pre_ada def test_fp8(self): - quant_config = QuantConfig(quant_algo=QuantAlgo.FP8, + quant_config = QuantConfig(QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) - with LLM(self.MODEL_PATH, quant_config=quant_config) as llm: + with LLM(self.MODEL_PATH, + quant_config=quant_config, + kv_cache_config=kv_cache_config, + max_batch_size=8) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) class TestMistral_NeMo_Minitron_8B_Instruct(LlmapiAccuracyTestHarness): @@ -277,6 +309,7 @@ def test_auto_dtype(self): task = MMLU(self.MODEL_NAME) task.evaluate(llm) + @skip_pre_ada def test_fp8(self): quant_config = QuantConfig(QuantAlgo.FP8) with LLM(self.MODEL_PATH, quant_config=quant_config) as llm: @@ -420,7 +453,7 @@ def test_fp8(self): @skip_pre_ada def test_fp8_kvcache(self): "RCCA: https://nvbugs/5065080" - quant_config = QuantConfig(QuantAlgo.FP8, + quant_config = QuantConfig(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8) with LLM(self.MODEL_PATH, quant_config=quant_config) as llm: task = CnnDailymail(self.MODEL_NAME) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 89483fd2620..c6db6076f7c 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1766,6 +1766,36 @@ def test_auto_dtype_tp8(self): task.evaluate(llm) +class TestMistralNemo12B(LlmapiAccuracyTestHarness): + MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base" + MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407" + + @pytest.mark.skip_less_device_memory(80000) + def test_auto_dtype(self): + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) + + with LLM(self.MODEL_PATH, + kv_cache_config=kv_cache_config, + max_batch_size=8) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + @pytest.mark.skip_less_device(2) + def test_auto_dtype_tp2(self): + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) + + with LLM(self.MODEL_PATH, + kv_cache_config=kv_cache_config, + tensor_parallel_size=2, + max_batch_size=8) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + @pytest.mark.timeout(5400) @pytest.mark.skip_less_device_memory(80000) class TestLlama3_3NemotronSuper49Bv1(LlmapiAccuracyTestHarness): diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index cf3eec8eeb3..8b19cabd605 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -435,7 +435,9 @@ accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8_kvcache accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4] accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4_awq] accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int8_awq] -accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 +accuracy/test_llm_api.py::TestMistralNemo12B::test_auto_dtype +accuracy/test_llm_api.py::TestMistralNemo12B::test_auto_dtype_tp2 +accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8 accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2 accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2 @@ -579,6 +581,8 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend +accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-] test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-] From 26a83d6fdb78185305413d5baa2b0ab05c5f73f9 Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Mon, 18 Aug 2025 13:32:29 +0800 Subject: [PATCH 20/20] fix invalid test name Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> --- tests/integration/test_lists/test-db/l0_h100.yml | 2 +- tests/integration/test_lists/waives.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 64f6498d095..a52b515e644 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -253,7 +253,7 @@ l0_h100: - examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] - examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] # 5 mins - accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8 - - accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 + - accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 - examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] # 7 mins - examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] - examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 7c1a5bb8693..c4733b9b36a 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -301,7 +301,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5457489) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5457489) disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5457504) -accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 SKIP (https://nvbugs/5413197) +accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197) triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349) triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624) triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343)