From f277afdd93365809dbb76ced50be0cd2f9db55c4 Mon Sep 17 00:00:00 2001 From: Daniel Stokes <40156487+djns99@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:15 +1200 Subject: [PATCH 01/88] perf: Enable 128x256 tile shapes for FP4 MOE CUTLASS backend (#5986) Signed-off-by: Daniel Stokes <40156487+djns99@users.noreply.github.com> --- .../kernels/cutlass_kernels/cutlass_heuristic.cpp | 6 ++++-- .../moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl | 9 +++++++-- .../moe_gemm/moe_gemm_template_dispatch_tma_ws.h | 2 +- .../kernels/cutlass_kernels/python/generate_kernels.py | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp index 47d95589b77..9e3bbaa32b7 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp @@ -383,8 +383,10 @@ std::vector get_candidate_configs_sm100(CutlassGemmConfig::Ca MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1}); candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x128x128B, MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1}); - // candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B, - // MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1}); + candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B, + MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1}); + candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x256x128B, + MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1}); candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B, MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x2x1}); candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x64x128B, diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl index fe35f690a9e..d5f0b198fd8 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl @@ -342,11 +342,16 @@ using SafeBF16 = void; using EpilogueTileShapeSm100 = decltype(shape_div(TileShape{}, AtomThrShape{})); \ using EpilogueTileShape = std::conditional_t; \ using EpilogueElementC = std::conditional_t; \ + using EpilogueTensorOp = std::conditional_t; \ + using EpilogueSubTile \ + = std::conditional_t, cutlass::epilogue::collective::EpilogueTileAuto>; \ /* Epilogue For Default Finalize */ \ using CollectiveEpilogueDefault = typename cutlass::epilogue::collective::CollectiveBuilder Date: Tue, 15 Jul 2025 00:49:42 +0200 Subject: [PATCH 02/88] [nvbugs-5318143] fix: restrict PyTorch memory usage to avoid OOMs (#5964) Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com> --- docker/Dockerfile.multi | 3 ++ tensorrt_llm/_torch/pyexecutor/_util.py | 44 +++++++++++++++++++ tensorrt_llm/_torch/pyexecutor/config.py | 5 +++ .../_torch/pyexecutor/py_executor_creator.py | 6 ++- tests/integration/test_lists/waives.txt | 2 - 5 files changed, 56 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index a8ee5da002e..da67d0a4994 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -66,6 +66,9 @@ RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_mpi4py.sh && rm install_mpi4py.s ARG TORCH_INSTALL_TYPE="skip" COPY docker/common/install_pytorch.sh install_pytorch.sh RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh +# +# NB: PyTorch requires this to be < 1.0 +ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999" # Install OpenCV with FFMPEG support RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/ diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py index 6e969a8d1de..979bc83f218 100644 --- a/tensorrt_llm/_torch/pyexecutor/_util.py +++ b/tensorrt_llm/_torch/pyexecutor/_util.py @@ -1,3 +1,4 @@ +import os import random from collections.abc import Iterable from typing import Dict, List, Optional @@ -18,6 +19,7 @@ from ..model_config import ModelConfig from ..speculative import get_spec_decoder +from .config import PyTorchConfig from .config_utils import is_mla, is_nemotron_hybrid from .kv_cache_transceiver import AttentionTypeCpp, create_kv_cache_transceiver from .llm_request import ExecutorResponse @@ -718,3 +720,45 @@ def _try_infer_num_experts(model_config: ModelConfig) -> int: return 1 return num_experts + + +def _adjust_torch_mem_fraction(pytorch_backend_config: PyTorchConfig): + # FIXME: PyTorch only uses the garbage_collection_threshold setting + # if a memory fraction is set, cf. + # https://github.com/pytorch/pytorch/blob/cd995bfb2aac8891465809be3ce29543bd524287/c10/cuda/CUDACachingAllocator.cpp#L1357 + logger.debug("Setting PyTorch memory fraction to 1.0") + torch.cuda.set_per_process_memory_fraction(1.0) + + # FIXME: As soon as + # torch.cuda._set_allocator_settings (added in PyTorch 2.8.0-rc1) + # or a similar API is available, the warning below should be removed + # and the allocator GC threshold be set via the new API instead. + torch_allocator_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "") + torch_mem_threshold_advised = ( + torch.cuda.get_allocator_backend() == "native" + and "expandable_segments:True" not in torch_allocator_config) + torch_mem_threshold_set = "garbage_collection_threshold:" in torch_allocator_config + if torch_mem_threshold_advised and not torch_mem_threshold_set: + logger.warning( + "It is recommended to incl. 'garbage_collection_threshold:0.???' or 'backend:cudaMallocAsync'" + " or 'expandable_segments:True' in PYTORCH_CUDA_ALLOC_CONF.") + + # NOTE: Even if a memory threshold was not set (cf. warning above), setting a memory + # fraction < 1.0 is beneficial, because + # https://github.com/pytorch/pytorch/blob/5228986c395dc79f90d2a2b991deea1eef188260/c10/cuda/CUDACachingAllocator.cpp#L2719 + # and + # https://github.com/pytorch/pytorch/blob/5228986c395dc79f90d2a2b991deea1eef188260/c10/cuda/CUDACachingAllocator.cpp#L1240 + # lead PyTorch to release all unused memory before hitting the set fraction. This + # still mitigates OOM, although at a higher performance impact, because it + # effectively resets the allocator cache. + if not pytorch_backend_config._limit_torch_cuda_mem_fraction: + return + mem_reserved = torch.cuda.memory_reserved() + mem_free, mem_total = torch.cuda.mem_get_info() + safety_margin = 32 * 1024**2 + mem_torch_max = mem_free + mem_reserved - safety_margin + mem_torch_fraction = mem_torch_max / mem_total + logger.info( + f"Setting PyTorch memory fraction to {mem_torch_fraction} ({mem_torch_max / 1024**3} GiB)" + ) + torch.cuda.set_per_process_memory_fraction(mem_torch_fraction) diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py index 19278089677..b1935a51234 100644 --- a/tensorrt_llm/_torch/pyexecutor/config.py +++ b/tensorrt_llm/_torch/pyexecutor/config.py @@ -92,6 +92,11 @@ class PyTorchConfig: force_dynamic_quantization: bool = False + # If true, adjust PyTorch CUDA memory fraction to correspond to the + # total GPU memory minus the statically allocated engine memory. + # If false, set the PyTorch CUDA memory fraction to 1.0. + _limit_torch_cuda_mem_fraction: bool = True + EXETENDED_EXECUTOR_CONFIG_FIELDS = [ 'backend', diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index a72f6a58b12..b6893d69e26 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -20,8 +20,8 @@ from ..attention_backend.interface import AttentionRuntimeFeatures from ..distributed import MPIDist from ..speculative import get_spec_drafter, get_spec_resource_manager -from ._util import (KvCacheCreator, create_py_executor_instance, - instantiate_sampler, is_mla) +from ._util import (KvCacheCreator, _adjust_torch_mem_fraction, + create_py_executor_instance, instantiate_sampler, is_mla) from .config import PyTorchConfig from .config_utils import is_mla from .model_engine import PyTorchModelEngine @@ -432,5 +432,7 @@ def create_py_executor( garbage_collection_gen0_threshold, ) + _adjust_torch_mem_fraction(executor_config.pytorch_backend_config) + py_executor.start_worker() return py_executor diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index da014ed54de..291e549c648 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -372,8 +372,6 @@ perf/test_perf.py::test_perf[mamba_130m-bench-float16-input_output_len:128,128] perf/test_perf.py::test_perf[bert_large-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411) perf/test_perf.py::test_perf[roberta_base-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411) test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5236980) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5318143) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5318143) disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5328160) stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] SKIP (https://nvbugs/5328495) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] SKIP (https://nvbugs/5322354) From 2320f12321fef0e508b25de0b5d4ee3f46e6b374 Mon Sep 17 00:00:00 2001 From: Yechan Kim <161688079+yechank-nvidia@users.noreply.github.com> Date: Tue, 15 Jul 2025 10:26:51 +0900 Subject: [PATCH 03/88] doc: update EXAONE 4.0 news (#6034) Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com> --- README.md | 1 + examples/models/core/exaone/README.md | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7c25d94c1e6..99b00e26195 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ TensorRT-LLM ✨ [➡️ link](./docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) ## Latest News +* [07/15] 🌟 TensorRT-LLM delivers Day-0 support for LG AI Research's latest model, EXAONE 4.0 [➡️ link](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B) * [06/17] Join NVIDIA and DeepInfra for a developer meetup on June 26 ✨ [➡️ link](https://events.nvidia.com/scaletheunscalablenextgenai) * [05/22] Blackwell Breaks the 1,000 TPS/User Barrier With Meta’s Llama 4 Maverick ✨ [➡️ link](https://developer.nvidia.com/blog/blackwell-breaks-the-1000-tps-user-barrier-with-metas-llama-4-maverick/) diff --git a/examples/models/core/exaone/README.md b/examples/models/core/exaone/README.md index cf5be149ddf..51c17e14c02 100644 --- a/examples/models/core/exaone/README.md +++ b/examples/models/core/exaone/README.md @@ -52,16 +52,13 @@ git clone https://huggingface.co/LGAI-EXAONE/EXAONE-Deep-2.4B $HF_MODEL_DIR ### EXAONE-4.0 -Download he HuggingFace checkpoints of EXAONE-4.0 model. Here, we only use the `TODO: replace with REAL name, EXAONE-4.0` model for the example. From EXAONE-4.0 model, we support EXAONE models only on PyTorch flow. +Download he HuggingFace checkpoints of EXAONE-4.0 model. Here, we only use the `EXAONE-4.0-32B` model for the example. From EXAONE-4.0 model, we support only on PyTorch flow. ```bash export HF_MODEL_DIR=hf_models/exaone4 -git clone ... $HF_MODEL_DIR (TODO Change ... to real HF directory) +git clone https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B $HF_MODEL_DIR ``` -## Usage -The next section describe how to convert the weights from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format. We will use llama's [convert_checkpoint.py](../llama/convert_checkpoint.py) for EXAONE model and then we build the model with `trtllm-build`. - ### Pytorch flow To quickly run EXAONE-4.0 models, you can use [examples/llm-api/quickstart_advanced.py](../../../llm-api/quickstart_advanced.py): @@ -116,6 +113,9 @@ Temporarily switching to `DynamicCache` when creating PTQ models could help addr For models with sliding window attention, DynamicCache is less memory-efficient than HybridCache because it retains the entire key-value cache. However, this does not break the model's attention logic, as the cache implementation is separated from the attention computation itself. This trade-off is acceptable for the PTQ process, which is a one-time procedure. Our tests confirm that this workaround does not degrade accuracy on MMLU or GSM8K benchmarks with the default ModelOpt settings. ### TRT flow + +The next section describe how to convert the weights from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format. We will use llama's [convert_checkpoint.py](../llama/convert_checkpoint.py) for EXAONE model and then we build the model with `trtllm-build`. + ### Convert checkpoint and build TensorRT engine(s) ```bash From 2ea407799370baf4b2089290a3f1e52b6e4be422 Mon Sep 17 00:00:00 2001 From: Rashid Kaleem <4079439+arekay@users.noreply.github.com> Date: Mon, 14 Jul 2025 20:29:19 -0500 Subject: [PATCH 04/88] [Model load] Fix llama min-latency model load (#5883) Signed-off-by: Rashid Kaleem <4079439+arekay@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_llama.py | 3 ++ .../models/modeling_llama_min_latency.py | 3 ++ tensorrt_llm/_torch/models/modeling_utils.py | 30 +++++++++++++++++-- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py index fc3febe8384..1c17eeb5a8e 100644 --- a/tensorrt_llm/_torch/models/modeling_llama.py +++ b/tensorrt_llm/_torch/models/modeling_llama.py @@ -624,6 +624,7 @@ def __init__(self, model_config: ModelConfig[LlamaConfig]): self.num_hidden_layers = config.num_hidden_layers self.aux_stream = torch.cuda.Stream() self.mapping = model_config.mapping + self.preload_weight_modules = [] if self.model_config.mapping.enable_attention_dp: self.embed_tokens = Embedding( @@ -646,6 +647,7 @@ def __init__(self, model_config: ModelConfig[LlamaConfig]): if model_config.enable_min_latency: from .modeling_llama_min_latency import Llama4MinLatencyDecoderLayer DecoderLayerClass = Llama4MinLatencyDecoderLayer + self.preload_weight_modules = ["gate_up_proj"] self.layers = nn.ModuleList([ DecoderLayerClass( @@ -878,6 +880,7 @@ def __init__( model_config.pretrained_config = model_config.pretrained_config.text_config model_config.pretrained_config.architectures = architectures super().__init__(Llama4Model(model_config), model_config) + self.preload_weight_modules = self.model.preload_weight_modules def forward( self, diff --git a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py index 88a78cfb136..72a5b4843fb 100644 --- a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py +++ b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py @@ -98,6 +98,9 @@ def load_weights(self, weights: List[Dict]): # After loading weights, calculate the combined scale (input_scale * weight_scale) for special kernels and # trtllm-gen kernels. if self.has_fp8_qdq: + if self.weight_scale.device != self.input_scale.device: + self.weight_scale = torch.nn.Parameter( + self.weight_scale.to(self.input_scale.device)) self.combined_scale = self.input_scale * self.weight_scale # If this is gate_up_proj + swiglu and trtllm-gen kernels will be used, we need to reorder the weights diff --git a/tensorrt_llm/_torch/models/modeling_utils.py b/tensorrt_llm/_torch/models/modeling_utils.py index a8ce31bf2ce..1dac009f5c1 100755 --- a/tensorrt_llm/_torch/models/modeling_utils.py +++ b/tensorrt_llm/_torch/models/modeling_utils.py @@ -525,7 +525,11 @@ def forward( ) def load_weights(self, weights: Dict, skip_modules: List[str] = []): - _load_weights_impl(self, weights, skip_modules) + preload_weight_modules = getattr(self, "preload_weight_modules", None) + _load_weights_impl(self, + weights, + skip_modules, + preload_weight_modules=preload_weight_modules) def infer_max_seq_len(self) -> int: # Modified from tensorrt_llm/builder.py _init_max_seq_len @@ -675,7 +679,10 @@ def run_concurrently(func, def _load_weights_impl(model: Union[nn.Module, DecoderModelForCausalLM], weights: Dict, skip_modules: List[str] = [], - params_map: Optional[Dict[str, str]] = None): + params_map: Optional[Dict[str, str]] = None, + preload_weight_modules: Optional[List[str]] = None): + # TODO: remove preload_weight_modules - it is a workaround for min-latency llama4 model loading where + # we need some order in the module loading. Once this is resolved, we can remove this workaround. if not hasattr(model, 'model_config') or not isinstance( model.model_config, ModelConfig): raise ValueError("model must have a model_config attribute") @@ -756,7 +763,24 @@ def load_single_module(name, module): desc="Loading weights"): load_single_module(name, module) else: + all_modules = dict(model.named_modules()) + serial_load_modules = [] + if preload_weight_modules is not None: + for module in preload_weight_modules: + serial_load_modules.extend([ + name for name in all_modules.keys() if name.endswith(module) + ]) + logger.info(f"Serial load modules: {serial_load_modules}") + pbar = tqdm(serial_load_modules, desc="Loading weights serially") + for module in serial_load_modules: + # logger.info(f"Loading weights for {module} in serial") + load_single_module(module, all_modules[module]) + pbar.update(1) + del all_modules[module] + pbar.close() + pbar = tqdm(list(model.named_modules()), desc="Loading weights concurrently") - args_list = [(name, module) for name, module in model.named_modules()] + args_list = [(name, module) for name, module in model.named_modules() + if name not in serial_load_modules] run_concurrently(load_single_module, args_list, pbar=pbar) From dd2491f47d2022f5f013e467ce62026ef13bcab2 Mon Sep 17 00:00:00 2001 From: Daniel Stokes <40156487+djns99@users.noreply.github.com> Date: Tue, 15 Jul 2025 13:40:42 +1200 Subject: [PATCH 05/88] fix: Fix MOE benchmark to rotate buffers to prevent L2 cache reuse (#4135) Signed-off-by: Daniel Stokes <40156487+djns99@users.noreply.github.com> --- .../mixtureOfExpertsBackendBenchmarkFixture.h | 166 +++++++++++++----- ...ixtureOfExpertsBackendBenchmarkLauncher.cu | 2 +- 2 files changed, 122 insertions(+), 46 deletions(-) diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h index 3cd87acf251..0790b842d45 100644 --- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h +++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h @@ -76,6 +76,7 @@ namespace // Abstract class for routing config struct RoutingConfig { + virtual void start(){}; virtual void setRouting(int* selected_experts, int64_t num_experts, int64_t k, int64_t num_tokens) = 0; virtual std::string getName() = 0; virtual bool isDeterministic() const = 0; @@ -143,6 +144,11 @@ struct RandomDistributionRoutingConfig : public RoutingConfig "Cannot create random routing distribution. Number of experts does not match the number of weights"); } + void start() + { + twister.seed(0xD5); + } + std::string getName() override { return name; @@ -208,6 +214,11 @@ struct UniformRoutingConfig : public RoutingConfig { std::mt19937_64 twister{0xD5}; + void start() + { + twister.seed(0xD5); + } + std::string getName() override { return "uniform"; @@ -522,14 +533,32 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture ActivationType mActType = ActivationType::Relu; - QuantParams mQuantParams{}; + constexpr static int64_t NUM_BUFFERS = 32; + + std::array mQuantParams{}; bool mUseLora = false; bool mUsePrequantScale = false; int mGroupSize = -1; - LoraParams mLoraParams{}; + std::array mLoraParams{}; std::optional mSelectedConfig = std::nullopt; + int64_t mBufferIndex = 0; + size_t mWorkspaceSize = 0; + size_t mExpertWeight1Size = 0; + size_t mExpertWeight2Size = 0; + size_t mExpertBias1Size = 0; + size_t mExpertBias2Size = 0; + size_t mInputTensorSize = 0; + size_t mFinalOutputSize = 0; + size_t mSourceToExpandedMapSize = 0; + size_t mScaleProbsSize = 0; + size_t mSelectedExpertsSize = 0; + size_t mExpertFP4WeightSf1Size = 0; + size_t mExpertFP4WeightSf2Size = 0; + size_t mExpertIntScale1Size = 0; + size_t mExpertIntScale2Size = 0; + template T* allocBuffer(size_t size) { @@ -558,30 +587,39 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture mGatedMultiplier = mIsGated ? 2 : 1; auto const gated_inter = mInterSize * mGatedMultiplier; - size_t workspace_size - = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType, {}, - mUseLora, /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, mUsePrequantScale); + mWorkspaceSize = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType, + {}, mUseLora, /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, mUsePrequantScale); - mWorkspace = allocBuffer(workspace_size); + mWorkspace = allocBuffer(mWorkspaceSize * NUM_BUFFERS); size_t const expert_matrix_size = mNumExperts * mHiddenSize * mInterSize; - mExpertWeight1 = allocBuffer(expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE); - mExpertWeight2 = allocBuffer(expert_matrix_size / WEIGHT_ELEM_PER_BYTE); + mExpertWeight1Size = expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE; + mExpertWeight2Size = expert_matrix_size / WEIGHT_ELEM_PER_BYTE; + mExpertWeight1 = allocBuffer(mExpertWeight1Size * NUM_BUFFERS); + mExpertWeight2 = allocBuffer(mExpertWeight2Size * NUM_BUFFERS); mExpertBias1 = nullptr; mExpertBias2 = nullptr; if (mUseBias) { - mExpertBias1 = allocBuffer(mNumExperts * gated_inter); - mExpertBias2 = allocBuffer(mNumExperts * mHiddenSize); + mExpertBias1Size = mNumExperts * gated_inter; + mExpertBias2Size = mNumExperts * mHiddenSize; + mExpertBias1 = allocBuffer(mExpertBias1Size * NUM_BUFFERS); + mExpertBias2 = allocBuffer(mExpertBias2Size * NUM_BUFFERS); } if constexpr (INT_QUANT) { - mExpertIntScale1 = allocBuffer(mNumExperts * gated_inter); - mExpertIntScale2 = allocBuffer(mNumExperts * mHiddenSize); + mExpertIntScale1Size = mNumExperts * gated_inter; + mExpertIntScale2Size = mNumExperts * mHiddenSize; + mExpertIntScale1 = allocBuffer(mExpertIntScale1Size * NUM_BUFFERS); + mExpertIntScale2 = allocBuffer(mExpertIntScale2Size * NUM_BUFFERS); - mQuantParams = QuantParams::Int(mExpertIntScale1, mExpertIntScale2); + for (int i = 0; i < NUM_BUFFERS; i++) + { + mQuantParams[i] = QuantParams::Int( + mExpertIntScale1 + mExpertIntScale1Size * i, mExpertIntScale2 + mExpertIntScale2Size * i); + } } else if constexpr (FP8) { @@ -589,39 +627,57 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture mExpertFP8Scale2 = allocBuffer(1); mExpertFP8Scale3 = allocBuffer(mNumExperts); - mQuantParams = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3); + for (int i = 0; i < NUM_BUFFERS; i++) + { + mQuantParams[i] = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3); + } } else if constexpr (ANY_FP4) { mExpertFP4ActScale1 = allocBuffer(1); - mExpertFP4WeightSf1 = allocBuffer(num_experts * gated_inter * mHiddenSize / FP4_VECTOR_SIZE); + mExpertFP4WeightSf1Size = num_experts * gated_inter * mHiddenSize / FP4_VECTOR_SIZE; + mExpertFP4WeightSf1 = allocBuffer(mExpertFP4WeightSf1Size * NUM_BUFFERS); mExpertFP4GlobalScale1 = allocBuffer(num_experts); mExpertFP4ActScale2 = allocBuffer(1); - mExpertFP4WeightSf2 = allocBuffer(num_experts * mInterSize * mHiddenSize / FP4_VECTOR_SIZE); + mExpertFP4WeightSf2Size = num_experts * mInterSize * mHiddenSize / FP4_VECTOR_SIZE; + mExpertFP4WeightSf2 = allocBuffer(mExpertFP4WeightSf2Size * NUM_BUFFERS); mExpertFP4GlobalScale2 = allocBuffer(num_experts); auto func = NVFP4 ? QuantParams::FP4 : QuantParams::FP8MXFP4; - mQuantParams = func(mExpertFP4ActScale1, mExpertFP4WeightSf1, mExpertFP4GlobalScale1, mExpertFP4ActScale2, - mExpertFP4WeightSf2, mExpertFP4GlobalScale2, false, false); + for (int i = 0; i < NUM_BUFFERS; i++) + { + mQuantParams[i] = func(mExpertFP4ActScale1, mExpertFP4WeightSf1 + mExpertFP4WeightSf1Size * i, + mExpertFP4GlobalScale1, mExpertFP4ActScale2, mExpertFP4WeightSf2 + mExpertFP4WeightSf2Size * i, + mExpertFP4GlobalScale2, false, false); + } } - mSelectedExperts = allocBuffer(mTotalTokens * mK); - mScaleProbs = allocBuffer(mTotalTokens * mK); - mInputTensor = allocBuffer(mTotalTokens * mHiddenSize); - mFinalOutput = allocBuffer(mTotalTokens * mHiddenSize); + mSelectedExpertsSize = mTotalTokens * mK; + mSelectedExperts = allocBuffer(mSelectedExpertsSize * NUM_BUFFERS); + mScaleProbsSize = mTotalTokens * mK; + mScaleProbs = allocBuffer(mScaleProbsSize * NUM_BUFFERS); + mInputTensorSize = mTotalTokens * mHiddenSize; + mInputTensor = allocBuffer(mInputTensorSize * NUM_BUFFERS); + mFinalOutputSize = mTotalTokens * mHiddenSize; + mFinalOutput = allocBuffer(mFinalOutputSize * NUM_BUFFERS); - mSourceToExpandedMap = allocBuffer(mTotalTokens * mK); + mSourceToExpandedMapSize = mTotalTokens * mK; + mSourceToExpandedMap = allocBuffer(mSourceToExpandedMapSize * NUM_BUFFERS); mRoutingConfigIndex = routing_config; auto tactic = routingConfigCache.at(routing_config); - tactic->setRouting(mSelectedExperts, mNumExperts, mK, mTotalTokens); + tactic->start(); + for (int i = 0; i < NUM_BUFFERS; i++) + { + tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * i, mNumExperts, mK, mTotalTokens); + } check_cuda_error(cudaStreamSynchronize(streamPtr->get())); } - cudaGraph_t mGraph{}; - cudaGraphExec_t mGraphInstance{}; + std::array mGraph{}; + std::array mGraphInstance{}; void createGraph(MOEParallelismConfig parallelism_config) { @@ -630,11 +686,15 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture NVTX3_SCOPED_RANGE(BuildGraph); - check_cuda_error(cudaGraphCreate(&mGraph, 0)); - check_cuda_error(cudaStreamBeginCapture(streamPtr->get(), cudaStreamCaptureModeThreadLocal)); - runMoEPermute(parallelism_config); - check_cuda_error(cudaStreamEndCapture(streamPtr->get(), &mGraph)); - check_cuda_error(cudaGraphInstantiate(&mGraphInstance, mGraph, nullptr, nullptr, 0)); + for (int i = 0; i < NUM_BUFFERS; i++) + { + mBufferIndex = i; + check_cuda_error(cudaGraphCreate(&mGraph[i], 0)); + check_cuda_error(cudaStreamBeginCapture(streamPtr->get(), cudaStreamCaptureModeThreadLocal)); + runMoEPermute(parallelism_config); + check_cuda_error(cudaStreamEndCapture(streamPtr->get(), &mGraph[i])); + check_cuda_error(cudaGraphInstantiate(&mGraphInstance[i], mGraph[i], nullptr, nullptr, 0)); + } } void destroyGraph() @@ -644,16 +704,20 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture NVTX3_SCOPED_RANGE(DestroyGraph); - check_cuda_error(cudaGraphExecDestroy(mGraphInstance)); - check_cuda_error(cudaGraphDestroy(mGraph)); + for (int i = 0; i < NUM_BUFFERS; i++) + { + check_cuda_error(cudaGraphExecDestroy(mGraphInstance[i])); + check_cuda_error(cudaGraphDestroy(mGraph[i])); + } } float benchmarkLoop(MOEParallelismConfig parallelism_config) { + mBufferIndex = (mBufferIndex + 1) % NUM_BUFFERS; auto tactic = routingConfigCache.at(mRoutingConfigIndex); if (!tactic->isDeterministic()) { - tactic->setRouting(mSelectedExperts, mNumExperts, mK, mTotalTokens); + tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * mBufferIndex, mNumExperts, mK, mTotalTokens); } { @@ -661,7 +725,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture check_cuda_error(cudaEventRecord(mStartEvent, streamPtr->get())); if (useCudaGraph) { - cudaGraphLaunch(mGraphInstance, streamPtr->get()); + cudaGraphLaunch(mGraphInstance[mBufferIndex], streamPtr->get()); } else { @@ -802,17 +866,29 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture auto stream = streamPtr->get(); MoeMinLatencyParams min_latency_params; #ifdef USING_OSS_CUTLASS_MOE_GEMM - mMoERunner.runMoe(mInputTensor, nullptr, mSelectedExperts, mUseFinalScale ? mScaleProbs : nullptr, - mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens, - mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap, - parallelism_config, /*enable_alltoall=*/false, mUseLora, mLoraParams, - /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, min_latency_params, stream); + mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr, + mSelectedExperts + mSelectedExpertsSize * mBufferIndex, + mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr, + mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex, + mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex, + mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize, + mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex, + mFinalOutput + mFinalOutputSize * mBufferIndex, + mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config, + /*enable_alltoall=*/false, mUseLora, mLoraParams[mBufferIndex], + /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream); #else - mMoERunner.runMoe(mInputTensor, nullptr, mSelectedExperts, mUseFinalScale ? mScaleProbs : nullptr, - mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens, - mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap, - parallelism_config, mUseLora, mLoraParams, /*use_deepseek_fp8_block_scale=*/false, - /*min_latency_mode=*/false, min_latency_params, stream); + mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr, + mSelectedExperts + mSelectedExpertsSize * mBufferIndex, + mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr, + mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex, + mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex, + mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize, + mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex, + mFinalOutput + mFinalOutputSize * mBufferIndex, + mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config, mUseLora, + mLoraParams[mBufferIndex], + /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream); #endif } diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu index b71509fdbc1..663759e3ff7 100644 --- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu +++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu @@ -623,7 +623,7 @@ void help() " \"dtypes\": [string, ...], (optional)\n" " \"routing_name\": string, (optional)\n" " \"selected_experts\": [int, ...], or string, (optional, length is a multiple of k)\n" - " \"expert_distribtuion\": [float, ...], or string, (optional, length is num_experts)\n" + " \"expert_distribution\": [float, ...], or string, (optional, length is num_experts)\n" " },\n" " ...\n" "]\n" From 24dfd4cd0bf49e3917b8ac69c381391a989aec6c Mon Sep 17 00:00:00 2001 From: jiahanc <173873397+jiahanc@users.noreply.github.com> Date: Mon, 14 Jul 2025 19:37:26 -0700 Subject: [PATCH 06/88] Doc: Update llama-3.3-70B guide (#6028) Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> --- examples/models/core/llama/README.md | 52 ++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/examples/models/core/llama/README.md b/examples/models/core/llama/README.md index cdf660035c2..18f950ac4b4 100644 --- a/examples/models/core/llama/README.md +++ b/examples/models/core/llama/README.md @@ -37,6 +37,10 @@ This document shows how to build and run a LLaMA model in TensorRT-LLM on both s - [Convert Checkpoint to TensorRT-LLM Unified Checkpoint](#convert-checkpoint-to-tensorrt-llm-unified-checkpoint) - [Build Engine](#build-engine) - [Run Inference](#run-inference) + - [Run LLaMa-3.3 70B Model on PyTorch Backend](#run-llama-33-70b-model-on-pytorch-backend) + - [Prepare TensorRT-LLM extra configs](#prepare-tensorrt-llm-extra-configs) + - [Launch trtllm-serve OpenAI-compatible API server](#launch-trtllm-serve-openai-compatible-api-server) + - [Run performance benchmarks](#run-performance-benchmarks) ## Overview @@ -1542,3 +1546,51 @@ bash -c 'python ./examples/mmlu.py --test_trt_llm \ --kv_cache_free_gpu_memory_fraction 0.999 \ --max_tokens_in_paged_kv_cache 65064' ``` + +## Run LLaMa-3.3 70B Model on PyTorch Backend +This section provides the steps to run LLaMa-3.3 70B model FP8 precision on PyTorch backend by launching TensorRT-LLM server and run performance benchmarks. + + +### Prepare TensorRT-LLM extra configs +```bash +cat >./extra-llm-api-config.yml < Date: Tue, 15 Jul 2025 11:06:03 +0800 Subject: [PATCH 07/88] infra: [TRTLLM-6331] Support show all stage name list when stage name check failed (#5946) Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com> Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> --- jenkins/L0_Test.groovy | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 7dfac2415b5..26c52689766 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1642,7 +1642,8 @@ def checkStageNameSet(stageNames, jobKeys, paramName) { echo "Validate stage names for the passed GitLab bot params [${paramName}]." invalidStageName = stageNames.findAll { !(it in jobKeys) } if (invalidStageName) { - throw new Exception("Cannot find the stage names [${invalidStageName}] from the passed params [${paramName}].") + def sortedJobKeys = jobKeys.sort() + throw new Exception("Cannot find the stage names [${invalidStageName}] from the passed params [${paramName}]. Available stage names (${sortedJobKeys.size()} total):\n${sortedJobKeys.collect { " ${it}" }.join('\n')}") } } From 6b35afaf1bc3983e3cf871bf76c31e89a3470866 Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Tue, 15 Jul 2025 11:27:21 +0800 Subject: [PATCH 08/88] [Infra][TRTLLM-6013] - Fix stage name in single stage test rerun report (#5672) Signed-off-by: Yiqing Yan Co-authored-by: Yanchao Lu --- jenkins/L0_MergeRequest.groovy | 2 +- jenkins/L0_Test.groovy | 11 +++++++---- {tests/integration/defs => jenkins}/test_rerun.py | 0 3 files changed, 8 insertions(+), 5 deletions(-) rename {tests/integration/defs => jenkins}/test_rerun.py (100%) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 6773d1c3ef5..ecfdac3a8dc 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -771,7 +771,7 @@ def collectTestResults(pipeline, testFilter) trtllm_utils.llmExecStepWithRetry(pipeline, script: "apk add py3-pip") trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 config set global.break-system-packages true") sh """ - python3 llm/tests/integration/defs/test_rerun.py \ + python3 llm/jenkins/test_rerun.py \ generate_rerun_report \ --output-file=rerun/rerun_report.xml \ --input-files=${inputfiles} diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 26c52689766..548846612f7 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1111,7 +1111,7 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) { // Generate rerun test lists def failSignaturesList = trtllm_utils.getFailSignaturesList().join(",") sh """ - python3 ${llmSrc}/tests/integration/defs/test_rerun.py \ + python3 ${llmSrc}/jenkins/test_rerun.py \ generate_rerun_tests_list \ --output-dir=${WORKSPACE}/${stageName}/ \ --input-file=${WORKSPACE}/${stageName}/results.xml \ @@ -1184,12 +1184,15 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) { } } - // generate rerun report + // Specify the stage name correctly + sh "cd ${WORKSPACE}/${stageName} && sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' *.xml || true" + + // Generate rerun report inputFiles = ["${WORKSPACE}/${stageName}/results.xml", "${WORKSPACE}/${stageName}/rerun_results_1.xml", "${WORKSPACE}/${stageName}/rerun_results_2.xml"] sh """ - python3 ${llmSrc}/tests/integration/defs/test_rerun.py \ + python3 ${llmSrc}/jenkins/test_rerun.py \ generate_rerun_report \ --output-file=${WORKSPACE}/${stageName}/rerun_results.xml \ --input-files=${inputFiles.join(",")} @@ -1197,7 +1200,7 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) { // Update original results xml file with rerun results xml files for junit sh """ - python3 ${llmSrc}/tests/integration/defs/test_rerun.py \ + python3 ${llmSrc}/jenkins/test_rerun.py \ merge_junit_xmls \ --output-file=${WORKSPACE}/${stageName}/results.xml \ --input-files=${inputFiles.join(",")} \ diff --git a/tests/integration/defs/test_rerun.py b/jenkins/test_rerun.py similarity index 100% rename from tests/integration/defs/test_rerun.py rename to jenkins/test_rerun.py From e499f6c44ab32674ae99b4ff31b01a9cb41c6765 Mon Sep 17 00:00:00 2001 From: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com> Date: Tue, 15 Jul 2025 01:31:35 -0400 Subject: [PATCH 09/88] [Fix] check for ImportError or ModuleNotFoundError for deep_ep_utils (#6026) Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com> --- tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py index 178d4d35849..62146d9295f 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py +++ b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py @@ -12,7 +12,7 @@ try: from tensorrt_llm.deep_ep import Buffer deep_ep_installed = True -except ModuleNotFoundError: +except ImportError: deep_ep_installed = False From d811843a08ae6fcd61b5c1f53d8fd21865862800 Mon Sep 17 00:00:00 2001 From: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> Date: Tue, 15 Jul 2025 14:39:31 +0800 Subject: [PATCH 10/88] =?UTF-8?q?infra:=20[TRTLLM-6313]=20Fix=20the=20pack?= =?UTF-8?q?age=20sanity=20stage=20'Host=20Node=20Name'=20in=E2=80=A6=20(#5?= =?UTF-8?q?945)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com> --- jenkins/L0_Test.groovy | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 548846612f7..7dec81f7fde 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -2019,6 +2019,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch) trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", { echo "###### Prerequisites Start ######" + echoNodeAndGpuInfo(pipeline, toStageName(values[1], key)) // Clean up the pip constraint file from the base NGC PyTorch image. if (values[5] == DLFW_IMAGE) { trtllm_utils.llmExecStepWithRetry(pipeline, script: "[ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true") @@ -2064,7 +2065,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) } withEnv(libEnv) { sh "env | sort" - runLLMTestlistOnPlatform(pipeline, gpu_type, "l0_sanity_check", config, false, "${values[1]}-${key}-sanity-check" , 1, 1, true, null) + runLLMTestlistOnPlatform(pipeline, gpu_type, "l0_sanity_check", config, false, toStageName(values[1], key), 1, 1, true, null) } }) } From 4e4d18826fbd476bb05a09307bc0abe350b5a235 Mon Sep 17 00:00:00 2001 From: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> Date: Tue, 15 Jul 2025 14:50:03 +0800 Subject: [PATCH 11/88] =?UTF-8?q?chore:=20[Breaking=20Change]=20Rename=20c?= =?UTF-8?q?uda=5Fgraph=5Fconfig=20padding=5Fenabled=20fie=E2=80=A6=20(#600?= =?UTF-8?q?3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> --- ...practice_on_DeepSeek-R1_in_TensorRT-LLM.md | 7 +- ..._R1_MTP_Implementation_and_Optimization.md | 6 +- ..._R1_Throughput_on_NVIDIA_Blackwell_GPUs.md | 2 +- ...ling_Expert_Parallelism_in_TensorRT-LLM.md | 3 +- docs/source/performance/perf-overview.md | 2 +- docs/source/scripts/disaggregated/gen_yaml.py | 6 +- examples/llm-api/llm_runtime.py | 2 +- examples/llm-api/quickstart_advanced.py | 6 +- examples/models/core/deepseek_v3/README.md | 13 ++- examples/models/core/llama/README.md | 4 +- examples/models/core/llama4/README.md | 8 +- examples/models/core/qwen/README.md | 4 +- examples/wide_ep/ep_load_balancer/README.md | 26 +++-- examples/wide_ep/slurm_scripts/gen_yaml.py | 2 +- .../_torch/pyexecutor/model_engine.py | 6 +- tensorrt_llm/bench/benchmark/utils/general.py | 2 +- tensorrt_llm/llmapi/__init__.py | 3 +- tensorrt_llm/llmapi/llm_args.py | 103 ++++++++++-------- .../defs/accuracy/test_llm_api_pytorch.py | 45 ++++---- ..._lite_attention_dp_overlap_cuda_graph.yaml | 2 +- ...2_deepseek_v3_lite_overlap_cuda_graph.yaml | 2 +- .../disagg_config_cuda_graph_padding.yaml | 2 +- .../defs/perf/pytorch_model_config.py | 6 +- .../defs/stress_test/stress_test.py | 2 +- .../_torch/modeling/test_modeling_deepseek.py | 4 +- .../multi_gpu_modeling/test_deepseek.py | 6 +- .../_torch/test_pytorch_model_engine.py | 5 +- .../api_stability/references/llm.yaml | 10 +- tests/unittest/llmapi/test_llm_args.py | 6 +- 29 files changed, 156 insertions(+), 139 deletions(-) diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index f17caefc445..98c72e700d6 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -138,7 +138,8 @@ YOUR_DATA_PATH= cat >./extra-llm-api-config.yml<./extra-llm-api-config.yml <./extra-llm-api-config.yml < cat >./extra-llm-api-config.yml< cat >./extra-llm-api-config.yml< ./extra_llm_api_options_eplb.yaml < /tmp/extra-llm-api-config.yml cuda_graph_config: - padding_enabled: true + enable_padding: true batch_sizes: [1, 4, 8, 12] EOF @@ -169,9 +169,10 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \ cat < /tmp/extra-llm-api-config.yml cuda_graph_config: - padding_enabled: true + enable_padding: true batch_sizes: [1, 2] -moe_max_num_tokens: 16384 +moe_config: + max_num_tokens: 16384 EOF trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \ @@ -237,7 +238,7 @@ To serve the model using `trtllm-serve`: ```bash cat >./extra-llm-api-config.yml <./gen-extra-llm-api-config.yml </path/to/TensorRT-LLM/extra-llm-api-config.yml <./extra-llm-api-config.yml <./extra-llm-api-config.yml <./gen-extra-llm-api-config.yml < ./extra_llm_api_options.yaml < ./extra_llm_api_options_eplb.yaml < ./extra_llm_api_options_eplb.yaml < **Note:** Similar to offline EP Load Balancer, you can enable expert ID counting to verify the effectiveness of EPLB, but remember to disable it when running inference for benchmarking or production purposes. -> **Explanation on moe_max_num_tokens:** For Large Scale EP, there can be extreme conditions that all ranks send tokens to a single rank since they all want that expert. +> **Explanation on max_num_tokens of moe_config:** For Large Scale EP, there can be extreme conditions that all ranks send tokens to a single rank since they all want that expert. In that case, that rank will have too many tokens to compute. In order not to make the hot rank OOM, there is one strategy that chunk the tokens if there are too much. -`moe_max_num_tokens` is the parameter that controls the max chunk size. However, this may have performance penalty if there is enough since batch size is smaller. +`max_num_tokens` of moe_config is the parameter that controls the max chunk size. However, this may have performance penalty if there is enough since batch size is smaller. So by default, it is set to some value that all tokens can complete in one wave. However, if EP size is large, we may need to trade off that in order not to OOM or got other runtime errors due to lack of memory. -One good point is that if memory is OK, we can set `moe_max_num_tokens` to `max_batch_size * ep_size` to make all generation requests can be processed in one chunk. -For example, if `ep_size` is 36 and `max_batch_size` is 256, we may set `moe_max_num_tokens` to 9216. +One good point is that if memory is OK, we can set `max_num_tokens` to `max_batch_size * ep_size` to make all generation requests can be processed in one chunk. +For example, if `ep_size` is 36 and `max_batch_size` is 256, we may set `max_num_tokens` to 9216. diff --git a/examples/wide_ep/slurm_scripts/gen_yaml.py b/examples/wide_ep/slurm_scripts/gen_yaml.py index fd1de76b98b..121f614d870 100644 --- a/examples/wide_ep/slurm_scripts/gen_yaml.py +++ b/examples/wide_ep/slurm_scripts/gen_yaml.py @@ -196,7 +196,7 @@ def gen_config_file(config_path: str, 'max_seq_len': 2176, 'free_gpu_memory_fraction': gen_gpu_memory_fraction, 'cuda_graph_config': { - 'padding_enabled': True, + 'enable_padding': True, 'batch_sizes': gen_cuda_graph_batch_sizes, }, 'print_iter_log': True, diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 204094646a1..42a0c001076 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -309,7 +309,7 @@ def get_rank_model_storage(model): def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int], max_batch_size: int, max_num_tokens: int, max_draft_len: int, - padding_enabled: bool) -> list[int]: + enable_padding: bool) -> list[int]: # This is the largest possible batch size for a pure decoding batch. max_cuda_graph_bs = min(max_batch_size, int(max_num_tokens / (1 + max_draft_len))) @@ -326,8 +326,8 @@ def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int], # is that if the user is OK padding to a batch size B, they should also # be OK with padding to some size B' < B since the performance will generally # just be better in the smaller case. - if padding_enabled and (i == 0 - or result[i - 1] != max_cuda_graph_bs): + if enable_padding and (i == 0 + or result[i - 1] != max_cuda_graph_bs): logger.warning( "CUDA graph padding is enabled, but one of the given CUDA graph " f"batch sizes ({bs}) is larger than the executor's max batch size " diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py index 0db7d7b21b4..153e262276f 100755 --- a/tensorrt_llm/bench/benchmark/utils/general.py +++ b/tensorrt_llm/bench/benchmark/utils/general.py @@ -152,7 +152,7 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str, pass cuda_graph_config = { - "padding_enabled": True, + "enable_padding": True, "max_batch_size": max_batch_size } diff --git a/tensorrt_llm/llmapi/__init__.py b/tensorrt_llm/llmapi/__init__.py index a912b1c80c8..24f7ad00e75 100644 --- a/tensorrt_llm/llmapi/__init__.py +++ b/tensorrt_llm/llmapi/__init__.py @@ -9,7 +9,7 @@ CudaGraphConfig, DraftTargetDecodingConfig, DynamicBatchConfig, EagleDecodingConfig, ExtendedRuntimePerfKnobConfig, KvCacheConfig, LlmArgs, - LookaheadDecodingConfig, MedusaDecodingConfig, + LookaheadDecodingConfig, MedusaDecodingConfig, MoeConfig, MTPDecodingConfig, NGramDecodingConfig, SchedulerConfig, TorchCompileConfig, TorchLlmArgs, TrtLlmArgs, UserProvidedDecodingConfig) @@ -27,6 +27,7 @@ 'KvCacheConfig', 'KvCacheRetentionConfig', 'CudaGraphConfig', + 'MoeConfig', 'LookaheadDecodingConfig', 'MedusaDecodingConfig', 'EagleDecodingConfig', diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 4ca266d53ed..1b385b6e8fc 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -72,7 +72,7 @@ class CudaGraphConfig(BaseModel): max_batch_size: int = Field( default=0, description="Maximum batch size for CUDA graphs.") - padding_enabled: bool = Field( + enable_padding: bool = Field( default=False, description= "If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance." @@ -88,6 +88,30 @@ def validate_cuda_graph_max_batch_size(cls, v): return v +class MoeConfig(BaseModel): + """ + Configuration for MoE. + """ + backend: Literal["CUTLASS", "CUTEDSL", "WIDEEP", "TRTLLM", + "VANILLA"] = Field(default='CUTLASS', + description="MoE backend to use.") + + max_num_tokens: Optional[int] = Field( + default=None, + description= + "If set, at most max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds max_num_tokens, the input tensors will be split into chunks and a for loop will be used." + ) + + load_balancer: Optional[Union[object, str]] = Field( + default=None, + description="Configuration for MoE load balancing.", + json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"}) + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + @dataclass class _ParallelConfig: ''' The model distribution configs for LLM. ''' @@ -1768,26 +1792,12 @@ class TorchLlmArgs(BaseLlmArgs): disable_overlap_scheduler: bool = Field( default=False, description="Disable the overlap scheduler.") - moe_max_num_tokens: Optional[int] = Field( - default=None, - description= - "If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used." - ) - - moe_load_balancer: Optional[Union[object, str]] = Field( - default=None, - description="Configuration for MoE load balancing.", - json_schema_extra={ - "type": - "Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]" - }) + moe_config: MoeConfig = Field(default_factory=MoeConfig, + description="MoE config.") attn_backend: str = Field(default='TRTLLM', description="Attention backend to use.") - moe_backend: str = Field(default='CUTLASS', - description="MoE backend to use.") - enable_mixed_sampler: bool = Field( default=False, description= @@ -1889,25 +1899,6 @@ def extra_resource_managers(self) -> Dict[str, object]: def extra_resource_managers(self, value: Dict[str, object]) -> None: self._extra_resource_managers = value - @model_validator(mode="after") - def validate_moe_load_balancer(self): - from .._torch.model_config import MoeLoadBalancerConfig - if isinstance(self.moe_load_balancer, str): - if not os.path.exists(self.moe_load_balancer): - raise FileNotFoundError( - f"MoE load balancer config file not found: {self.moe_load_balancer}" - ) - try: - with open(self.moe_load_balancer) as f: - moe_load_balancer_config = yaml.safe_load(f) - self.moe_load_balancer = MoeLoadBalancerConfig( - **moe_load_balancer_config) - except Exception as e: - raise ValueError( - f"Failed to load MoE load balancer config file: {self.moe_load_balancer}" - ) from e - return self - @model_validator(mode="after") def validate_stream_interval(self): if self.stream_interval <= 0: @@ -1917,17 +1908,17 @@ def validate_stream_interval(self): @staticmethod def _generate_cuda_graph_batch_sizes(max_batch_size: int, - padding_enabled: bool) -> List[int]: + enable_padding: bool) -> List[int]: """Generate a list of batch sizes for CUDA graphs. Args: max_batch_size: Maximum batch size to generate up to - padding_enabled: Whether padding is enabled, which affects the batch size distribution + enable_padding: Whether padding is enabled, which affects the batch size distribution Returns: List of batch sizes to create CUDA graphs for """ - if padding_enabled: + if enable_padding: batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)] else: batch_sizes = list(range(1, 32)) + [32, 64, 128] @@ -1947,6 +1938,25 @@ def _generate_cuda_graph_batch_sizes(max_batch_size: int, return batch_sizes + @model_validator(mode="after") + def validate_load_balancer(self) -> 'TorchLlmArgs': + from .._torch import MoeLoadBalancerConfig + if isinstance(self.moe_config.load_balancer, str): + if not os.path.exists(self.moe_config.load_balancer): + raise FileNotFoundError( + f"MoE load balancer config file not found: {self.moe_config.load_balancer}" + ) + try: + with open(self.moe_config.load_balancer) as f: + moe_load_balancer_config = yaml.safe_load(f) + self.moe_config.load_balancer = MoeLoadBalancerConfig( + **moe_load_balancer_config) + except Exception as e: + raise ValueError( + f"Failed to load MoE load balancer config file: {self.load_balancer}" + ) from e + return self + @model_validator(mode='after') def validate_cuda_graph_config(self) -> 'TorchLlmArgs': """Validate CUDA graph configuration. @@ -1965,7 +1975,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs': config.batch_sizes = sorted(config.batch_sizes) if config.max_batch_size != 0: if config.batch_sizes != self._generate_cuda_graph_batch_sizes( - config.max_batch_size, config.padding_enabled): + config.max_batch_size, config.enable_padding): raise ValueError( "Please don't set both cuda_graph_config.batch_sizes " "and cuda_graph_config.max_batch_size.\n" @@ -1977,7 +1987,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs': else: max_batch_size = config.max_batch_size or 128 generated_sizes = self._generate_cuda_graph_batch_sizes( - max_batch_size, config.padding_enabled) + max_batch_size, config.enable_padding) config.batch_sizes = generated_sizes config.max_batch_size = max_batch_size @@ -1996,14 +2006,14 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig": cuda_graph_max_batch_size=self.cuda_graph_config.max_batch_size if self.cuda_graph_config else CudaGraphConfig.model_fields['max_batch_size'].default, - cuda_graph_padding_enabled=self.cuda_graph_config.padding_enabled + cuda_graph_padding_enabled=self.cuda_graph_config.enable_padding if self.cuda_graph_config else - CudaGraphConfig.model_fields['padding_enabled'].default, + CudaGraphConfig.model_fields['enable_padding'].default, disable_overlap_scheduler=self.disable_overlap_scheduler, - moe_max_num_tokens=self.moe_max_num_tokens, - moe_load_balancer=self.moe_load_balancer, + moe_max_num_tokens=self.moe_config.max_num_tokens, + moe_load_balancer=self.moe_config.load_balancer, attn_backend=self.attn_backend, - moe_backend=self.moe_backend, + moe_backend=self.moe_config.backend, enable_mixed_sampler=self.enable_mixed_sampler, enable_trtllm_sampler=self.enable_trtllm_sampler, kv_cache_dtype=self.kv_cache_dtype, @@ -2046,6 +2056,7 @@ def update_llm_args_with_extra_dict( "enable_build_cache": BuildCacheConfig, "speculative_config": DecodingBaseConfig, "lora_config": LoraConfig, + "moe_config": MoeConfig, } for field_name, field_type in field_mapping.items(): if field_name in llm_args_dict: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index f0b8bd50b5c..eb4cadc985d 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -19,7 +19,7 @@ from tensorrt_llm import LLM from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig, - KvCacheConfig, MTPDecodingConfig, + KvCacheConfig, MoeConfig, MTPDecodingConfig, NGramDecodingConfig, SamplingParams, TorchCompileConfig) from tensorrt_llm.models.modeling_utils import QuantConfig @@ -97,7 +97,7 @@ def test_bfloat16(self, attn_backend, torch_compile): enable_fullgraph=True) if torch_compile else None pytorch_config = dict( torch_compile_config=torch_compile_config, - cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile, + cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile, batch_sizes=[4]), attn_backend=attn_backend, disable_overlap_scheduler=torch_compile, @@ -123,7 +123,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend, enable_fullgraph=True) if torch_compile else None pytorch_config = dict( torch_compile_config=torch_compile_config, - cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile, + cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile, batch_sizes=[4]), attn_backend=attn_backend, disable_overlap_scheduler=torch_compile, @@ -147,7 +147,7 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile): enable_fullgraph=True) if torch_compile else None pytorch_config = dict( torch_compile_config=torch_compile_config, - cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile, + cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile, batch_sizes=[4]), attn_backend=attn_backend, disable_overlap_scheduler=torch_compile, @@ -185,7 +185,7 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend, enable_fullgraph=True) if torch_compile else None pytorch_config = dict( torch_compile_config=torch_compile_config, - cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile, + cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile, batch_sizes=[4]), attn_backend=attn_backend, disable_overlap_scheduler=torch_compile, @@ -719,7 +719,7 @@ def test_cute_dsl_fp8_block_scales( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_config=torch_compile_config, - moe_backend="CUTEDSL", + moe_config=MoeConfig(backend="CUTEDSL"), ) quant_config = QuantConfig() @@ -759,7 +759,7 @@ def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn): disable_overlap_scheduler=False, cuda_graph_config=CudaGraphConfig( max_batch_size=512, - padding_enabled=True, + enable_padding=True, ), ) with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", @@ -782,7 +782,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn, mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) pytorch_config = dict( disable_overlap_scheduler=False, - cuda_graph_config=CudaGraphConfig(padding_enabled=True), + cuda_graph_config=CudaGraphConfig(enable_padding=True), ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES @@ -899,7 +899,7 @@ def test_cute_dsl_fp8_block_scales_4gpus( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_config=torch_compile_config, - moe_backend="CUTEDSL", + moe_config=MoeConfig(backend="CUTEDSL"), ) quant_config = QuantConfig() @@ -948,8 +948,9 @@ def test_fp8_block_scales_4gpus_static_eplb(self): initial_global_assignments=initial_global_assignments, layer_updates_per_iter=0) pytorch_backend_options = dict(cuda_graph_config=CudaGraphConfig(), - moe_backend="WIDEEP", - moe_load_balancer=eplb_config) + moe_config=MoeConfig( + backend="WIDEEP", + load_balancer=eplb_config)) with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", tensor_parallel_size=4, moe_expert_parallel_size=4, @@ -968,8 +969,8 @@ def test_bfloat16_4gpus_online_eplb(self, mtp_nextn): eplb_config = MoeLoadBalancerConfig(num_slots=num_slots, layer_updates_per_iter=2) pytorch_config = dict(cuda_graph_config=CudaGraphConfig(), - moe_backend="WIDEEP", - moe_load_balancer=eplb_config) + moe_config=MoeConfig(backend="WIDEEP", + load_balancer=eplb_config)) mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) @@ -992,8 +993,9 @@ def test_nvfp4_4gpus_online_eplb(self, fp8kv): eplb_config = MoeLoadBalancerConfig(num_slots=num_slots, layer_updates_per_iter=2) pytorch_backend_options = dict(cuda_graph_config=CudaGraphConfig(), - moe_backend="WIDEEP", - moe_load_balancer=eplb_config) + moe_config=MoeConfig( + backend="WIDEEP", + load_balancer=eplb_config)) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.NVFP4 if fp8kv: @@ -1035,8 +1037,7 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler, disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None, torch_compile_config=torch_compile_config, - moe_backend=moe_backend, - ) + moe_config=MoeConfig(backend=moe_backend)) mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) @@ -1095,7 +1096,7 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph, disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None, torch_compile_config=torch_compile_config, - moe_backend=moe_backend, + moe_config=MoeConfig(backend=moe_backend), ) mtp_config = None @@ -1331,7 +1332,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None, - moe_backend=moe_backend) + moe_config=MoeConfig(backend=moe_backend)) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.NVFP4 @@ -1726,7 +1727,7 @@ def test_nvfp4( pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None, - moe_backend=moe_backend, + moe_config=MoeConfig(backend=moe_backend), ) with LLM( @@ -1808,7 +1809,7 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, cuda_graph_config=CudaGraphConfig() if cuda_graph else None, - moe_backend=moe_backend) + moe_config=MoeConfig(backend=moe_backend)) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) with LLM( @@ -1854,7 +1855,7 @@ class TestKanana_Instruct(LlmapiAccuracyTestHarness): def test_auto_dtype(self): "RCCA: https://nvbugspro.nvidia.com/bug/5310520" pytorch_config = dict(cuda_graph_config=CudaGraphConfig( - padding_enabled=True, max_batch_size=384)) + enable_padding=True, max_batch_size=384)) with LLM(self.MODEL_PATH, **pytorch_config, enable_attention_dp=True) as llm: task = MMLU(self.MODEL_NAME) diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml index 6135aefa0a7..1171fb4f102 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml @@ -17,7 +17,7 @@ generation_servers: pipeline_parallel_size: 1 enable_attention_dp: true cuda_graph_config: - padding_enabled: False + enable_padding: False disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml index e4880434eb0..18acc70f9ac 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml @@ -15,7 +15,7 @@ generation_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 cuda_graph_config: - padding_enabled: False + enable_padding: False disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml index 8f1ff654b38..7009df9fd0f 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml @@ -28,7 +28,7 @@ generation_servers: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False cuda_graph_config: - padding_enabled: True + enable_padding: True batch_sizes: [1,4,8,16,24,32] disable_overlap_scheduler: True urls: diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index ea256eee7bb..d2ef5f8b536 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -30,7 +30,7 @@ def get_model_yaml_config(model_label: str, base_config = { 'print_iter_log': True, 'cuda_graph_config': { - 'padding_enabled': True, + 'enable_padding': True, }, } if 'kv_cache_dtype' in model_label: @@ -66,7 +66,7 @@ def get_model_yaml_config(model_label: str, 'config': { 'enable_attention_dp': True, 'cuda_graph_config': { - 'padding_enabled': True, + 'enable_padding': True, 'batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384] } } @@ -89,7 +89,7 @@ def get_model_yaml_config(model_label: str, 'config': { 'print_iter_log': True, 'cuda_graph_config': { - 'padding_enabled': True, + 'enable_padding': True, 'batch_sizes': [1, 512, 1024, 2048] } } diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py index bfa6abd0177..f0f85fe51e3 100644 --- a/tests/integration/defs/stress_test/stress_test.py +++ b/tests/integration/defs/stress_test/stress_test.py @@ -519,7 +519,7 @@ def stress_test(config, if config.backend == "pytorch": extra_llm_options.update({ "cuda_graph_config": { - "padding_enabled": True, + "enable_padding": True, "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384], }, "print_iter_log": True, diff --git a/tests/unittest/_torch/modeling/test_modeling_deepseek.py b/tests/unittest/_torch/modeling/test_modeling_deepseek.py index 660e09393f5..e5cf9680bbf 100644 --- a/tests/unittest/_torch/modeling/test_modeling_deepseek.py +++ b/tests/unittest/_torch/modeling/test_modeling_deepseek.py @@ -8,7 +8,7 @@ from utils.util import getSMVersion from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig +from tensorrt_llm.llmapi import KvCacheConfig, MoeConfig, MTPDecodingConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory @@ -71,7 +71,7 @@ def test_deepseek_trtllmgen(model_name): kv_cache_dtype="auto", attn_backend="TRTLLM", load_format="dummy", - moe_backend="TRTLLM", + moe_config=MoeConfig(backend="TRTLLM"), ) model_dir = str(llm_models_root() / Path(f"DeepSeek-R1/{model_name}")) diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py index eccdaaec988..5d2a8b71374 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py @@ -8,7 +8,7 @@ from utils.util import getSMVersion from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm.llmapi import KvCacheConfig +from tensorrt_llm.llmapi import KvCacheConfig, MoeConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory @@ -65,9 +65,8 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size): disable_overlap_scheduler=True, kv_cache_dtype="auto", attn_backend=backend, - moe_max_num_tokens=moe_max_num_tokens, ) - + moe_config = MoeConfig(max_num_tokens=moe_max_num_tokens) model_dir = str(llm_models_root() / model_name / model_path[quant]) assert Path(model_dir).exists() @@ -76,6 +75,7 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size): tensor_parallel_size=tp_size, enable_chunked_prefill=False, **pytorch_config, + moe_config=moe_config, moe_expert_parallel_size=-1, moe_tensor_parallel_size=-1, enable_attention_dp=enable_attention_dp, diff --git a/tests/unittest/_torch/test_pytorch_model_engine.py b/tests/unittest/_torch/test_pytorch_model_engine.py index 5a7c43bb5e7..30c05a67aa0 100644 --- a/tests/unittest/_torch/test_pytorch_model_engine.py +++ b/tests/unittest/_torch/test_pytorch_model_engine.py @@ -307,8 +307,7 @@ def test_cuda_graph_enable(self): "CUDA graphs should be disabled when cuda_graph_config=None") # Test 4: Custom CudaGraphConfig with specific settings - custom_config = CudaGraphConfig(max_batch_size=256, - padding_enabled=True) + custom_config = CudaGraphConfig(max_batch_size=256, enable_padding=True) llm_args_custom = LlmArgs.from_kwargs(model="dummy_model", cuda_graph_config=custom_config) pytorch_config_custom = llm_args_custom.get_pytorch_backend_config() @@ -317,7 +316,7 @@ def test_cuda_graph_enable(self): self.assertEqual(pytorch_config_custom.cuda_graph_max_batch_size, 256, "Custom max_batch_size should be respected") self.assertTrue(pytorch_config_custom.cuda_graph_padding_enabled, - "Custom padding_enabled should be respected") + "Custom enable_padding should be respected") if __name__ == "__main__": diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index e0d3bf5216c..132bdee5804 100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -69,18 +69,12 @@ methods: disable_overlap_scheduler: annotation: bool default: False - moe_max_num_tokens: - annotation: Optional[int] - default: null - moe_load_balancer: - annotation: Union[tensorrt_llm._torch.MoeLoadBalancerConfig, str, None] + moe_config: + annotation: tensorrt_llm.llmapi.llm_args.MoeConfig default: null attn_backend: annotation: str default: TRTLLM - moe_backend: - annotation: str - default: CUTLASS enable_mixed_sampler: annotation: bool default: False diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index b2eb9e8d8cd..0c2aaf20a13 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -272,7 +272,7 @@ def test_cuda_graph_batch_sizes_case_0_1(self): cuda_graph_config=CudaGraphConfig( batch_sizes=CudaGraphConfig._generate_cuda_graph_batch_sizes( 128, True), - padding_enabled=True, + enable_padding=True, max_batch_size=128)) assert args.cuda_graph_config.batch_sizes == CudaGraphConfig._generate_cuda_graph_batch_sizes( 128, True) @@ -282,14 +282,14 @@ def test_cuda_graph_batch_sizes_case_1(self): # set cuda_graph_batch_sizes only args = TorchLlmArgs(model=llama_model_path, cuda_graph_config=CudaGraphConfig( - batch_sizes=[1, 2, 4], padding_enabled=True)) + batch_sizes=[1, 2, 4], enable_padding=True)) assert args.cuda_graph_config.batch_sizes == [1, 2, 4] def test_cuda_graph_batch_sizes_case_2(self): # set cuda_graph_config.max_batch_size only args = TorchLlmArgs(model=llama_model_path, cuda_graph_config=CudaGraphConfig( - max_batch_size=128, padding_enabled=True)) + max_batch_size=128, enable_padding=True)) assert args.cuda_graph_config.batch_sizes == CudaGraphConfig._generate_cuda_graph_batch_sizes( 128, True) assert args.cuda_graph_config.max_batch_size == 128 From 2504aa552efdd85dc486cf23e78611b01f1dc8db Mon Sep 17 00:00:00 2001 From: ruodil <200874449+ruodil@users.noreply.github.com> Date: Tue, 15 Jul 2025 15:53:15 +0800 Subject: [PATCH 12/88] test: add recursive updating pytorch config and change MOE backend format in perf test (#6046) Signed-off-by: ruodil <200874449+ruodil@users.noreply.github.com> --- .../integration/defs/perf/pytorch_model_config.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index d2ef5f8b536..40b2b9f9682 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -18,6 +18,15 @@ """ +def recursive_update(d, u): + for k, v in u.items(): + if isinstance(v, dict) and isinstance(d.get(k), dict): + recursive_update(d[k], v) + else: + d[k] = v + return d + + def get_model_yaml_config(model_label: str, lora_dirs: list[str] = None) -> dict: """ @@ -130,7 +139,9 @@ def get_model_yaml_config(model_label: str, ], 'config': { 'enable_attention_dp': False, - 'moe_backend': 'TRTLLM' + 'moe_config': { + 'backend': 'TRTLLM' + } } } ] @@ -142,7 +153,7 @@ def get_model_yaml_config(model_label: str, patterns = [patterns] for pattern in patterns: if pattern in model_label.lower(): - base_config.update(pattern_config['config']) + recursive_update(base_config, pattern_config['config']) break # Stop checking other patterns for this config once we find a match # lora-specific change for pytorch From 2a147c4d01d0b73ee73c193c0d709f8d4c29f462 Mon Sep 17 00:00:00 2001 From: ruodil <200874449+ruodil@users.noreply.github.com> Date: Tue, 15 Jul 2025 15:53:59 +0800 Subject: [PATCH 13/88] test: add llama_v3.3_70b_cases in perf test (#6035) Signed-off-by: ruodil <200874449+ruodil@users.noreply.github.com> --- .../defs/perf/pytorch_model_config.py | 20 +++++++++++++++++++ tests/integration/defs/perf/test_perf.py | 1 + .../qa/trt_llm_release_perf_sanity_test.yml | 2 ++ .../qa/trt_llm_release_perf_test.yml | 7 +++++++ 4 files changed, 30 insertions(+) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index 40b2b9f9682..4c0ef184093 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -143,6 +143,26 @@ def get_model_yaml_config(model_label: str, 'backend': 'TRTLLM' } } + }, + # Llama-v3.3 models with fp8 quantization + { + 'patterns': [ + 'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-gpus:4', + 'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:1000,1000-gpus:4', + 'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:2000,500-gpus:4', + 'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:4', + 'llama_v3.3_70b_instruct_fp8-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:512,32-gpus:4', + ], + 'config': { + 'use_cuda_graph': + True, + 'cuda_graph_padding_enabled': + True, + 'cuda_graph_batch_sizes': [ + 1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048, + 4096, 8192 + ] + } } ] diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index eb8af3d593e..759ff9273f8 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -54,6 +54,7 @@ "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8", "llama_v3.3_70b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4", + "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct", "llama_v3.1_405b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4", "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct", diff --git a/tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml b/tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml index f5013e1b5b1..e7369bac1cd 100644 --- a/tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml +++ b/tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml @@ -202,6 +202,8 @@ trt_llm_release_perf_sanity_test: - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8] - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8] - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8] - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:1-input_output_len:128,128-reqs:10-gpus:8] diff --git a/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml b/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml index 6c9f6bcb261..1b3b539fd3e 100644 --- a/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml +++ b/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml @@ -295,6 +295,11 @@ trt_llm_release_perf_test: - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128+512,32-gpus:4] - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-ootb_except_mha-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:4] # FP8 specific tests - condition: @@ -357,6 +362,8 @@ trt_llm_release_perf_test: - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-input_output_len:128,128-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-reqs:64-con:250-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8] - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] From 9e871ca582e7867686398e3a9377f2e029944020 Mon Sep 17 00:00:00 2001 From: Yiteng Niu <6831097+niukuo@users.noreply.github.com> Date: Tue, 15 Jul 2025 17:18:38 +0800 Subject: [PATCH 14/88] [infra] add more log on reuse-uploading (#6036) Signed-off-by: Yiteng Niu <6831097+niukuo@users.noreply.github.com> Co-authored-by: Yanchao Lu --- jenkins/Build.groovy | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy index 51cd760425a..81193d2ddd5 100644 --- a/jenkins/Build.groovy +++ b/jenkins/Build.groovy @@ -306,18 +306,19 @@ def uploadArtifacts(artifacts, prefix = UPLOAD_PATH, retryTimes = 2, serverId = for (it in artifacts) { def uploadpath = it.key def filepath = it.value - echo "uploading ${filepath} as ${uploadpath}" - trtllm_utils.llmRetry(retryTimes, "uploadArtifacts", { - rtUpload ( - serverId: serverId, - spec: """{ + def spec = """{ "files": [ { "pattern": "${filepath}", "target": "${prefix}/${uploadpath}" } ] - }""", + }""" + echo "Uploading ${filepath} as ${uploadpath}. Spec: ${spec}" + trtllm_utils.llmRetry(retryTimes, "uploadArtifacts", { + rtUpload ( + serverId: serverId, + spec: spec, ) }) } From ab1c54709d42452bfe194517745094f36dbc751b Mon Sep 17 00:00:00 2001 From: Jaedeok Kim <110799725+jaedeok-nvidia@users.noreply.github.com> Date: Tue, 15 Jul 2025 18:41:54 +0900 Subject: [PATCH 15/88] fix: adjust window sizes of VSWA at torch backend (#5880) Signed-off-by: Jaedeok Kim --- .../_torch/pyexecutor/resource_manager.py | 132 ++++++++++++++++-- tensorrt_llm/_utils.py | 16 +++ .../unittest/_torch/test_resource_manager.py | 116 ++++++++++++++- 3 files changed, 251 insertions(+), 13 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index e52096727c6..ffa8ce4bdae 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -1,8 +1,9 @@ +import copy import enum import math from abc import ABC, abstractmethod from collections import OrderedDict, defaultdict -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Set, Tuple, Union import torch @@ -11,7 +12,7 @@ from tensorrt_llm.bindings.BuildInfo import ENABLE_MULTI_DEVICE from tensorrt_llm.sampling_params import SamplingParams -from ..._utils import nvtx_range +from ..._utils import binding_dtype_size, nvtx_range from ...logger import logger from ...mapping import Mapping from .llm_request import LlmRequest, LlmRequestState, SamplingConfig @@ -437,14 +438,10 @@ def calculate_max_num_blocks(self, cache_size_per_token = kv_factor * sum( self.num_kv_heads_per_layer) * head_dim - if dtype == DataType.FP8: - kv_cache_dtype_bytes = 1 - elif dtype in (DataType.HALF, DataType.BF16): - kv_cache_dtype_bytes = 2 - elif dtype == DataType.FLOAT: - kv_cache_dtype_bytes = 4 - else: + if dtype not in (DataType.FP8, DataType.HALF, DataType.BF16, + DataType.FLOAT): raise ValueError(f'Cannot support {dtype} KV cache.') + kv_cache_dtype_bytes = binding_dtype_size(dtype) cache_size_bytes_per_token = cache_size_per_token * kv_cache_dtype_bytes free_mem, total_mem = torch.cuda.mem_get_info() @@ -603,6 +600,102 @@ def _get_window_size_to_layers(self) -> dict[int, list[int]]: window_size_to_layers_map[window_size].append(local_layer_idx) return window_size_to_layers_map + @staticmethod + def adjust_window_sizes_for_vswa( + window_size_to_layers: Dict[int, List[int]], + kv_cache_config: KvCacheConfigCpp, + model_config: ModelConfig, + pool_memory_bytes: int, + kv_factor: int, + dtype: DataType, + is_cross_attention: bool = False, + ) -> Dict[int, List[int]]: + + assert is_cross_attention is False, 'Cross attention is not supported' + + max_tokens_from_config = kv_cache_config.max_tokens + + def calculate_cache_size_per_token(layers: Set[int]) -> int: + # Same as BaseKVCacheManager::calculateCacheSizePerTokenForSingleWindowSize + total_kv_heads = sum(model_config.num_kv_heads_per_layer[i] + for i in layers) + return total_kv_heads * kv_factor * model_config.head_size + + # Calculate the required memory bytes per sequence. + required_mem_bytes_per_seq = 0 + for window_size in sorted(window_size_to_layers): + layers = window_size_to_layers[window_size] + cache_size_per_token = calculate_cache_size_per_token(layers) + cache_size_bytes_per_token = cache_size_per_token * binding_dtype_size( + dtype) + required_mem_bytes_per_seq += window_size * cache_size_bytes_per_token + logger.debug( + f'Required memory per sequence: {required_mem_bytes_per_seq} bytes') + + if required_mem_bytes_per_seq < pool_memory_bytes: + # No need to adjust the window sizes. + return copy.deepcopy(window_size_to_layers) + + logger.debug( + f'Adjusting the window sizes {list(window_size_to_layers)} to fit ' + f'the memory {pool_memory_bytes} bytes.') + adjusted_window_size_to_layers = {} + + remaining_mem_bytes = pool_memory_bytes + remaining_layers = set(i for layers in window_size_to_layers.values() + for i in layers) + + accum_max_tokens = 0 + prev_window_size = 0 + + for window_size in sorted(window_size_to_layers): + layers = window_size_to_layers[window_size] + if remaining_mem_bytes > 0 and remaining_layers: + # Calculate cache size per token for remaining layers only + cache_size_per_token = calculate_cache_size_per_token( + remaining_layers) + cache_size_bytes_per_token = cache_size_per_token * binding_dtype_size( + dtype) + logger.debug( + f'Cache size per token for {len(remaining_layers)} layers: ' + f'{cache_size_bytes_per_token} bytes') + # Calculate max tokens that can fit in this window with remaining memory. + max_tokens_in_window = min( + remaining_mem_bytes // cache_size_bytes_per_token, + window_size - prev_window_size) + remaining_mem_bytes -= max_tokens_in_window * cache_size_bytes_per_token + accum_max_tokens += max_tokens_in_window + logger.debug(f'Remaining memory: {remaining_mem_bytes} bytes') + logger.debug( + f'Max token of window {window_size}: {accum_max_tokens}') + + if accum_max_tokens < window_size: + logger.debug( + f'Max tokens ({accum_max_tokens}) cannot fill the current window ({window_size}). ' + f'The larger windows will have the same max tokens.') + remaining_mem_bytes = 0 + + # Clamp the sequence length if provided explicitly. + if max_tokens_from_config is not None: + accum_max_tokens = min(max_tokens_from_config, + accum_max_tokens) + # If max tokens from config is reached, stop allocating + # more memory. Since the maximum number of tokens is + # already reached, for the remaining windows maxTokens + # will be set by the current value of accumMaxTokens. + if accum_max_tokens == max_tokens_from_config: + remaining_mem_bytes = 0 + + if accum_max_tokens not in adjusted_window_size_to_layers: + adjusted_window_size_to_layers[accum_max_tokens] = layers.copy() + else: + adjusted_window_size_to_layers[accum_max_tokens].extend(layers) + + remaining_layers -= set(layers) + prev_window_size = window_size + + return adjusted_window_size_to_layers + def calculate_max_num_blocks_from_cpp( self, kv_cache_config: KvCacheConfigCpp, @@ -622,6 +715,9 @@ def calculate_max_num_blocks_from_cpp( A dict of (max_attention_window, (blocks_in_primary_pool, blocks_in_secondary_pool)). """ + # VSWA on Torch backend has not supported the cross attention. + is_cross_attention = False + # Construct WorldConfig from self.mapping world_config_cpp = WorldConfig( tensor_parallelism=self.mapping.tp_size, @@ -636,12 +732,26 @@ def calculate_max_num_blocks_from_cpp( primary_pool_memory_bytes = free_mem secondary_pool_memory_bytes = 0 logger.debug( - f"primary_pool_memory_bytes is set to {primary_pool_memory_bytes/1024**3}GB, \nsecondary_pool_memory_bytes is set to {secondary_pool_memory_bytes/1024**3}GB" + f"primary_pool_memory_bytes is set to {primary_pool_memory_bytes/1024**3}GB, \n" + f"secondary_pool_memory_bytes is set to {secondary_pool_memory_bytes/1024**3}GB" + ) + + # Adjust the window sizes to fit the memory if even a single sequence + # cannot fit in the memory. + window_size_to_layers = self.adjust_window_sizes_for_vswa( + window_size_to_layers=window_size_to_layers, + model_config=model_config, + kv_cache_config=kv_cache_config, + pool_memory_bytes=primary_pool_memory_bytes, + kv_factor=self.kv_factor, + dtype=self.dtype, + is_cross_attention=is_cross_attention, ) blocks_per_window = KVCacheManagerCpp.calculate_max_num_blocks( config=kv_cache_config, - is_cross_attention=False, #TODO: support cross attention + # TODO: support cross attention + is_cross_attention=is_cross_attention, dtype=self.dtype, model_config=model_config, world_config=world_config_cpp, diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 9c3197e7c93..87144cb85c4 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -180,6 +180,22 @@ def str_dtype_to_torch(dtype): fp8=DataType.FP8, ) +_binding_dtype_size = { + DataType.INT64: 8, + DataType.FLOAT: 4, + DataType.INT32: 4, + DataType.BF16: 2, + DataType.HALF: 2, + DataType.BOOL: 1, + DataType.FP8: 1, + DataType.INT8: 1, + DataType.UINT8: 1, +} + + +def binding_dtype_size(dtype: DataType): + return _binding_dtype_size[dtype] + def str_dtype_to_binding(dtype): ret = _str_to_binding_dtype_dict.get(dtype) diff --git a/tests/unittest/_torch/test_resource_manager.py b/tests/unittest/_torch/test_resource_manager.py index 0632834e4e6..da1dae84ba1 100644 --- a/tests/unittest/_torch/test_resource_manager.py +++ b/tests/unittest/_torch/test_resource_manager.py @@ -10,13 +10,15 @@ import tensorrt_llm import tensorrt_llm.bindings -from tensorrt_llm._torch.pyexecutor.resource_manager import (PeftCacheConfig, +from tensorrt_llm._torch.pyexecutor.resource_manager import (KVCacheManager, + PeftCacheConfig, PeftCacheManager) from tensorrt_llm.bindings import ModelConfig as ModelConfigCpp from tensorrt_llm.bindings import executor as tllm from tensorrt_llm.bindings.internal.batch_manager import \ PeftTaskNotCachedException +DataType = tensorrt_llm.bindings.DataType LoraModule = tensorrt_llm.bindings.LoraModule LoraModuleType = tensorrt_llm.bindings.LoraModuleType current_dir = pathlib.Path(__file__).parent.resolve() @@ -66,7 +68,15 @@ def __init__(self): self.num_rnn_layers = 0 self.num_attention_heads = 1 self.hidden_size = 16 - self.data_type = tensorrt_llm.bindings.DataType.HALF + self.data_type = DataType.HALF + + @property + def num_kv_heads_per_layer(self): + return [self.num_attention_heads] * self.num_attention_layers + + @property + def head_size(self): + return self.hidden_size // self.num_attention_heads class MockPeftCacheManagerConfig: """ @@ -416,3 +426,105 @@ def test_put_get(self): self.assertEqual(entry.layer_id, expected_values[i][5]) self.assertEqual(entry.adapter_size, expected_values[i][6]) self.assertEqual(entry.num_slots, expected_values[i][7]) + + def test_adjust_window_sizes_for_vswa(self): + window_size_to_layers = { + 100: [0, 1, 2, 3], + 200: [4, 5, 6], + 7000: [7, 8], + } + + model_config = self.MockModelConfig() + model_config.num_attention_heads = 2 + model_config.hidden_size = 2 + model_config.data_type = DataType.HALF + + total_layers = [ + i for layers in window_size_to_layers.values() for i in layers + ] + + model_config.num_hidden_layers = len(total_layers) + model_config.num_attention_layers = len(total_layers) + + kv_factor = 2 + cache_bytes_per_token_per_layer = 8 + + # Define test cases: + # (memory_bytes, expected_window_sizes, max_tokens, description) + # If max_tokens is None, then it will use the default value of KvCacheConfig. + test_cases = [ + ( + # Case 1: Limited memory - windows get clamped + cache_bytes_per_token_per_layer * (100 * 9 + 30 * 5) + 4, + { + 100: [0, 1, 2, 3], + 130: [4, 5, 6, 7, 8], + }, + None, + "limited_memory_clamped_windows"), + ( + # Case 2: Less limited memory - the largest window get clamped + cache_bytes_per_token_per_layer * + (100 * 9 + 100 * 5 + 817 * 2) + 4, + { + 100: [0, 1, 2, 3], + 200: [4, 5, 6], + 1017: [7, 8], + }, + None, + "less_limited_memory_clamped_windows"), + ( + # Case 3: Sufficient memory - no clamping needed + cache_bytes_per_token_per_layer * + (100 * 4 + 200 * 3 + 7000 * 2) + 9402, + { + 100: [0, 1, 2, 3], + 200: [4, 5, 6], + 7000: [7, 8], + }, + None, + "sufficient_memory_no_clamping"), + ( + # Case 4: Very limited memory - all windows get small values + cache_bytes_per_token_per_layer * (51 * 9) + 1, + { + 51: [0, 1, 2, 3, 4, 5, 6, 7, 8], + }, + None, + "very_limited_memory_all_clamped"), + ( + # Case 5: Less limited memory but max_tokens is given. + # memory is enough for 1017 tokens, it will be clamped by max_tokens=134. + cache_bytes_per_token_per_layer * + (100 * 9 + 100 * 5 + 817 * 2) + 4, + { + 100: [0, 1, 2, 3], + 134: [4, 5, 6, 7, 8], + }, + 134, + "less_limited_memory_but_clamped_by_max_tokens"), + ] + + for memory_bytes, expected_window_sizes, max_tokens, description in test_cases: + with self.subTest(case=description, memory_bytes=memory_bytes): + kv_cache_config = tllm.KvCacheConfig(max_tokens=max_tokens) + adjusted = KVCacheManager.adjust_window_sizes_for_vswa( + window_size_to_layers=window_size_to_layers, + model_config=model_config, + kv_cache_config=kv_cache_config, + pool_memory_bytes=memory_bytes, + kv_factor=kv_factor, + dtype=model_config.data_type, + is_cross_attention=False, + ) + + self.assertEqual( + adjusted, expected_window_sizes, + f"Test case '{description}' failed.\n" + f"Memory bytes: {memory_bytes}\n" + f"Actual: {adjusted}\n" + f"Expected: {expected_window_sizes}") + + +if __name__ == "__main__": + unittest.main() From 9ebc3ab9c421c64e951daab535c27f4e7d99ce68 Mon Sep 17 00:00:00 2001 From: MinaHuai <121143971+MinaHuai@users.noreply.github.com> Date: Tue, 15 Jul 2025 22:01:35 +0800 Subject: [PATCH 16/88] [nvbugs/5385972][nvbugs/5387423][Fix] Minor fix for llava_next/llava_onevision (#5998) Signed-off-by: Mina Huai <121143971+MinaHuai@users.noreply.github.com> --- tensorrt_llm/runtime/multimodal_model_runner.py | 4 ++-- tensorrt_llm/tools/multimodal_builder.py | 4 ++-- tests/integration/test_lists/waives.txt | 4 ---- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/runtime/multimodal_model_runner.py b/tensorrt_llm/runtime/multimodal_model_runner.py index 9d6be2bddad..bb3a5480fcb 100644 --- a/tensorrt_llm/runtime/multimodal_model_runner.py +++ b/tensorrt_llm/runtime/multimodal_model_runner.py @@ -2647,7 +2647,7 @@ def setup_inputs(self, input_text, raw_image, raw_audio=None): ) image = None elif self.model_type in ['llava_onevision']: - pre_prompt = "<|im_start|>user " + pre_prompt = "<|im_start|>user " + "