diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000000..733d2aced5a --- /dev/null +++ b/.editorconfig @@ -0,0 +1,24 @@ +# Auto basic formatting when saving file with EditorConfig https://editorconfig.org/ + +# top-most EditorConfig file +root = true + +[*] +end_of_line = lf +trim_trailing_whitespace = true +insert_final_newline = true + +# make +[Makefile*] +indent_style = tab +indent_size = 4 + +# c++ +[*.{cpp,cu,h}] +indent_style = space +indent_size = 4 + +# python +[*.py] +indent_style = space +indent_size = 4 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ae3956b4430..afcf5adcda1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -107,7 +107,7 @@ /tensorrt_llm/_torch/models/modeling_nemotron_nas.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs /tensorrt_llm/_torch/models/modeling_nemotron_h.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs /tensorrt_llm/_torch/models/modeling_nemotron_nas.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs -/tensorrt_llm/_torch/pyexecutor/resource_manager.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-runtime-devs @NVIDIA/trt-llm-torch-models-devs +/tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs /tensorrt_llm/_torch/modules/mamba @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs /tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs /tests/unittest/_torch/modeling/test_modeling_nemotron.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 43b86df937f..4a8c8e9267f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -495,6 +495,17 @@ if(ENABLE_UCX) if(NOT ${ucx_FOUND}) set(ENABLE_UCX 0) else() + if(DEFINED ENV{GITHUB_MIRROR} AND NOT "$ENV{GITHUB_MIRROR}" STREQUAL "") + if(EXISTS "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake") + file(READ "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake" FILE_CONTENTS) + string( + REPLACE "https://raw.githubusercontent.com/rapidsai/rapids-cmake" + "$ENV{GITHUB_MIRROR}/rapidsai/rapids-cmake/raw/refs/heads" + FILE_CONTENTS "${FILE_CONTENTS}") + file(WRITE "${3RDPARTY_DIR}/ucxx/fetch_rapids.cmake" "${FILE_CONTENTS}") + message(WARNING "Replace UCXX fetch_rapids.cmake with internal mirror") + endif() + endif() # installing ucxx via add_subdirectory results in strange cudart linking # error, thus using their installation script to isolate the installation # process until the issue is understood. And always trigger the build so diff --git a/cpp/include/tensorrt_llm/common/assert.h b/cpp/include/tensorrt_llm/common/assert.h index e7e24bf549e..0e916b7746d 100644 --- a/cpp/include/tensorrt_llm/common/assert.h +++ b/cpp/include/tensorrt_llm/common/assert.h @@ -16,25 +16,8 @@ #pragma once -#include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/tllmException.h" -#include - -namespace tensorrt_llm::common -{ -[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info) -{ - throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str()); -} - -[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "") -{ - throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()).c_str()); -} - -} // namespace tensorrt_llm::common - class DebugConfig { public: @@ -86,12 +69,3 @@ class DebugConfig __FILE__, __LINE__, tensorrt_llm::common::fmtstr(info, ##__VA_ARGS__).c_str()); \ } \ } while (0) - -#define TLLM_THROW(...) \ - do \ - { \ - throw NEW_TLLM_EXCEPTION(__VA_ARGS__); \ - } while (0) - -#define TLLM_WRAP(ex) \ - NEW_TLLM_EXCEPTION("%s: %s", tensorrt_llm::common::TllmException::demangle(typeid(ex).name()).c_str(), ex.what()) diff --git a/cpp/include/tensorrt_llm/common/tllmException.h b/cpp/include/tensorrt_llm/common/tllmException.h index 15a1a770190..b24e6230fd9 100644 --- a/cpp/include/tensorrt_llm/common/tllmException.h +++ b/cpp/include/tensorrt_llm/common/tllmException.h @@ -16,11 +16,22 @@ #pragma once +#include "tensorrt_llm/common/stringUtils.h" + #include #include #include #include +#define TLLM_THROW(...) \ + do \ + { \ + throw NEW_TLLM_EXCEPTION(__VA_ARGS__); \ + } while (0) + +#define TLLM_WRAP(ex) \ + NEW_TLLM_EXCEPTION("%s: %s", tensorrt_llm::common::TllmException::demangle(typeid(ex).name()).c_str(), ex.what()) + #define NEW_TLLM_EXCEPTION(...) \ tensorrt_llm::common::TllmException(__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str()) @@ -45,4 +56,14 @@ class TllmException : public std::runtime_error int mNbFrames; }; +[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info) +{ + throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str()); +} + +[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "") +{ + throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info.c_str()).c_str()); +} + } // namespace tensorrt_llm::common diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp index 03d03eca3af..fba7f729c90 100644 --- a/cpp/tensorrt_llm/common/attentionOp.cpp +++ b/cpp/tensorrt_llm/common/attentionOp.cpp @@ -1647,12 +1647,15 @@ int AttentionOp::enqueueContext(EnqueueContextParams const& params, cudaStrea // Set BMM scales for FP8 context computation params.mla_param->bmm1_scale = fmha_bmm1_scale_ptr; params.mla_param->bmm2_scale = fmha_bmm2_scale_ptr; - params.mla_param->host_bmm1_scale = decoder_params.fmhaHostBmm1Scale; params.mla_param->quant_attention_input_buf = mFP8ContextMLA ? fp8_qkv_buffer : nullptr; // Set additional scales for context phase params.mla_param->quant_scale_o = params.attention_output_orig_quant; + params.mla_param->quant_scale_q = params.kv_scale_orig_quant; + params.mla_param->quant_scale_kv = params.kv_scale_orig_quant; params.mla_param->dequant_scale_q = params.kv_scale_quant_orig; params.mla_param->dequant_scale_kv = params.kv_scale_quant_orig; + params.mla_param->host_bmm1_scale + = 1 / (mQScaling * sqrt((float) (mMLAParams.qk_nope_head_dim + mMLAParams.qk_rope_head_dim))); if (mPagedContextFMHA && mPagedKVCache) { TLLM_CHECK_WITH_INFO(params.mla_param->context_paged_kv_ptr != nullptr, diff --git a/cpp/tensorrt_llm/common/workspace.h b/cpp/tensorrt_llm/common/workspace.h index 1406e821333..0dd32ed16d8 100644 --- a/cpp/tensorrt_llm/common/workspace.h +++ b/cpp/tensorrt_llm/common/workspace.h @@ -20,7 +20,8 @@ namespace tensorrt_llm::common { -std::uintptr_t constexpr kCudaMemAlign = 128; +// CuBLAS >= 12.9.1 requires 256-byte alignment. +std::uintptr_t constexpr kCudaMemAlign = 256; inline int8_t* alignPtr(int8_t* ptr, uintptr_t to) { diff --git a/cpp/tensorrt_llm/kernels/mlaKernels.cu b/cpp/tensorrt_llm/kernels/mlaKernels.cu index cac0e8f0513..cdb7abbb91b 100644 --- a/cpp/tensorrt_llm/kernels/mlaKernels.cu +++ b/cpp/tensorrt_llm/kernels/mlaKernels.cu @@ -207,8 +207,9 @@ inline __device__ void dequantCopy( template __global__ void applyMLARopeAndAssignQKVKernelOptContext(T* qkv_output, T const* fuse_buf, KVCacheBuffer kv_cache, float2 const* cos_sin_cache, size_t head_num, int head_size, int c_k, int* cu_q_seqlens, - int32_t const* kv_cache_lengths, uint32_t max_input_seq_len, KvCacheDataType cache_type, - float const* quant_scale_kv) + int32_t const* kv_cache_lengths, uint32_t max_input_seq_len, KvCacheDataType cache_type, float* bmm1_scale, + float* bmm2_scale, float const* quant_scale_o, float const* quant_scale_kv, float const* dequant_scale_q, + float const* dequant_scale_kv, float host_bmm1_scale) { // Constants. @@ -231,6 +232,32 @@ __global__ void applyMLARopeAndAssignQKVKernelOptContext(T* qkv_output, T const* size_t const batch_idx = blockIdx.y; size_t const head_idx = blockIdx.z; + if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0) + { + + // Calculate bmm scale for FP8 MLA + if (cache_type == KvCacheDataType::FP8) + { + float dequant_scale_q_val = dequant_scale_q ? dequant_scale_q[0] : 1.f; + float dequant_scale_kv_val = dequant_scale_kv ? dequant_scale_kv[0] : 1.f; + float quant_scale_o_val = quant_scale_o ? quant_scale_o[0] : 1.f; + if (bmm1_scale) + { + // The scale prepared for log2 optimization. + constexpr float kLog2e = 1.4426950408889634074f; + // The scale after fmha bmm1. + float bmm1_scale_val = dequant_scale_q_val * dequant_scale_kv_val * host_bmm1_scale; + bmm1_scale[0] = bmm1_scale_val; + bmm1_scale[1] = bmm1_scale_val * kLog2e; + } + if (bmm2_scale) + { + // The scale after fmha bmm2. + bmm2_scale[0] = quant_scale_o_val * dequant_scale_kv_val; + } + } + } + if (head_idx < head_num) { size_t const head_dim_vec_idx = (threadIdx.x % VECS_PER_HEAD); @@ -919,10 +946,11 @@ void invokeMLARopeContext(MlaParams& params, KVCacheBuffer kv_cache_buffer, c { dim3 grid(int(tensorrt_llm::common::divUp(params.max_input_seq_len, 32)), params.batch_size, params.head_num + 8); auto head_size = params.meta.qk_nope_head_dim; - applyMLARopeAndAssignQKVKernelOptContext - <<>>(params.attention_input_buf, params.latent_cache, kv_cache_buffer, - params.cos_sin_cache, params.head_num, head_size, params.meta.kv_lora_rank, params.cu_q_seqlens, - params.cache_seq_lens, params.max_input_seq_len, params.cache_type, params.quant_scale_kv); + applyMLARopeAndAssignQKVKernelOptContext<<>>( + params.attention_input_buf, params.latent_cache, kv_cache_buffer, params.cos_sin_cache, params.head_num, + head_size, params.meta.kv_lora_rank, params.cu_q_seqlens, params.cache_seq_lens, params.max_input_seq_len, + params.cache_type, params.bmm1_scale, params.bmm2_scale, params.quant_scale_o, params.quant_scale_kv, + params.dequant_scale_q, params.dequant_scale_kv, params.host_bmm1_scale); if (params.attention_input_buf != nullptr && params.quant_attention_input_buf != nullptr && params.cache_type == KvCacheDataType::FP8) { diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index ce0f94ac0eb..70fcc46df74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -102,9 +102,9 @@ class TllmGenFmhaKernel int headDimPerCtaV, int headDimQk, int headDimV, int tileSizeKv, int numTokensPerPage, int maxNumHeadsQPerKvInCta, bool reuseSmemKForV, bool uses2CtaMma) const { - TLLM_CHECK_WITH_INFO((headDimPerCtaV >= 32) && (headDimQk >= 32) && (headDimV >= 32) && (headDimPerCtaV <= 2048) - && (headDimQk <= 2048) && (headDimV <= 2048) && (numTokensPerPage <= 128), - "Expect (32 <= headDim <= 2048) && (numTokensPerPage <= 128), got headDimPerCtaV=%d, headDimQk=%d, " + TLLM_CHECK_WITH_INFO((headDimPerCtaV >= 32) && (headDimQk >= 32) && (headDimV >= 32) && (headDimPerCtaV <= 1024) + && (headDimQk <= 1024) && (headDimV <= 1024) && (numTokensPerPage <= 128), + "Expect (32 <= headDim <= 1024) && (numTokensPerPage <= 128), got headDimPerCtaV=%d, headDimQk=%d, " "headDimV=%d, numTokensPerPage=%d", headDimPerCtaV, headDimQk, headDimV, numTokensPerPage); TLLM_CHECK_WITH_INFO(maxNumHeadsQPerKvInCta <= 128, "The maxNumHeadsQPerKvInCta <= 128 is required."); @@ -115,19 +115,19 @@ class TllmGenFmhaKernel // Bit 8 - 11: kernelType. // Bit 12 - 15: tileScheduler. // Bit 16 - 17: multiCtasKvMode. - // Bit 18 - 24: (headDimPerCtaV >> 5). - // Bit 25 - 31: (headDimQk >> 5). - // Bit 32 - 38: (headDimV >> 5). - // Bit 39 - 40: (tileSizeKv >> 6). - // Bit 41 - 48: numTokensPerPage. + // Bit 18 - 25: (headDimPerCtaV >> 3). + // Bit 26 - 33: (headDimQk >> 3). + // Bit 34 - 41: (headDimV >> 3). + // Bit 42 - 43: (tileSizeKv >> 6). + // Bit 44 - 48: (numTokensPerPage >> 3). // Bit 49 - 56: maxNumHeadsQPerKvInCta. // Bit 57 - 57: reuseSmemKForV. // Bit 58 - 58: uses2CtaMma. return (static_cast(qkvLayout) << 0) | (static_cast(maskType) << 4) | (static_cast(kernelType) << 8) | (static_cast(scheduler) << 12) - | (static_cast(multiCtasKvMode) << 16) | (static_cast(headDimPerCtaV >> 5) << 18) - | (static_cast(headDimQk >> 5) << 25) | (static_cast(headDimV >> 5) << 32) - | (static_cast(tileSizeKv >> 6) << 39) | (static_cast(numTokensPerPage) << 41) + | (static_cast(multiCtasKvMode) << 16) | (static_cast(headDimPerCtaV >> 3) << 18) + | (static_cast(headDimQk >> 3) << 26) | (static_cast(headDimV >> 3) << 34) + | (static_cast(tileSizeKv >> 6) << 42) | (static_cast(numTokensPerPage >> 3) << 44) | (static_cast(maxNumHeadsQPerKvInCta) << 49) | (static_cast(reuseSmemKForV) << 57) | (static_cast(uses2CtaMma) << 58); } @@ -142,6 +142,17 @@ class TllmGenFmhaKernel std::pair checkIfKernelExist(RunnerParams const& params) const { + // Some conditions to check if the kernel is supported. + // This is meant to avoid occupying unnecessary hashId bits. + if (params.mHeadDimQk % 8 != 0 || params.mHeadDimV % 8 != 0) + { + return std::make_pair(false, "HeadDimQk and HeadDimV must be divisible by 8"); + } + if (params.mNumTokensPerPage % 8 != 0) + { + return std::make_pair(false, "NumTokensPerPage must be divisible by 8"); + } + // The selectKernelParams that might be updated. SelectKernelParams selectKernelParams{params}; auto [hashId, info] = hashFromRunnerParams(params, selectKernelParams); @@ -347,6 +358,11 @@ class TllmGenFmhaKernel selectKernelParams.mTileScheduler = TileScheduler::Persistent; // Need to select a different kernel. selectKernelParams.mSelectNewKernel = true; + // FIXME(perkz): use static scheduler instead as WAR for https://nvbugspro.nvidia.com/bug/5394685. + if (selectKernelParams.mUses2CtaMma) + { + selectKernelParams.mTileScheduler = TileScheduler::Static; + } } else if (totalNumCtas < params.mMultiProcessorCount && isMlaGenKernel(params) && selectKernelParams.mTileSizeKv == 128 && tensorrt_llm::common::getEnvUseTileSizeKv64ForTrtllmGen()) diff --git a/cpp/tests/unit_tests/runtime/decodingLayerWorkspaceTest.cpp b/cpp/tests/unit_tests/runtime/decodingLayerWorkspaceTest.cpp index 066ad5a8ca2..bb6ce6410ad 100644 --- a/cpp/tests/unit_tests/runtime/decodingLayerWorkspaceTest.cpp +++ b/cpp/tests/unit_tests/runtime/decodingLayerWorkspaceTest.cpp @@ -16,6 +16,7 @@ #include "tensorrt_llm/runtime/decodingLayerWorkspace.h" #include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/workspace.h" #include #include @@ -171,7 +172,7 @@ TEST_P(MirrorInWorkspaceTest, TestMirrorInWorkspaceFunctionality) requiredWorkspaceSize) << "The calculated workspace size cannot possibly be enough to contain all the tensors."; - constexpr std::size_t addressAlignment = 128; + constexpr std::size_t addressAlignment = tensorrt_llm::common::kCudaMemAlign; constexpr std::size_t numTensors = 3; constexpr std::size_t maxAlignmentOverhead = numTensors * addressAlignment; ASSERT_GE(hostTensor1->getSizeInBytes() + hostTensor2->getSizeInBytes() + hostTensor3->getSizeInBytes() diff --git a/docker/Makefile b/docker/Makefile index 8432710af43..b95ea971ef3 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -1,6 +1,8 @@ # Default base image for the docker build as defined in Dockerfile.multi BASE_IMAGE ?= $(shell grep '^ARG BASE_IMAGE=' Dockerfile.multi | grep -o '=.*' | tr -d '="') BASE_TAG ?= $(shell grep '^ARG BASE_TAG=' Dockerfile.multi | grep -o '=.*' | tr -d '="') +TRITON_IMAGE ?= $(shell grep '^ARG TRITON_IMAGE=' Dockerfile.multi | grep -o '=.*' | tr -d '="') +TRITON_BASE_TAG ?= $(shell grep '^ARG TRITON_BASE_TAG=' Dockerfile.multi | grep -o '=.*' | tr -d '="') # Name of the new image IMAGE_NAME ?= tensorrt_llm IMAGE_TAG ?= latest @@ -80,6 +82,8 @@ endef --progress $(DOCKER_PROGRESS) \ $(if $(BASE_IMAGE), --build-arg BASE_IMAGE=$(BASE_IMAGE)) \ $(if $(BASE_TAG), --build-arg BASE_TAG=$(BASE_TAG)) \ + $(if $(TRITON_IMAGE), --build-arg TRITON_IMAGE=$(TRITON_IMAGE)) \ + $(if $(TRITON_BASE_TAG), --build-arg TRITON_BASE_TAG=$(TRITON_BASE_TAG)) \ $(if $(BUILD_WHEEL_ARGS), --build-arg BUILD_WHEEL_ARGS="$(BUILD_WHEEL_ARGS)") \ $(if $(BUILD_WHEEL_SCRIPT), --build-arg BUILD_WHEEL_SCRIPT="$(BUILD_WHEEL_SCRIPT)") \ $(if $(TORCH_INSTALL_TYPE), --build-arg TORCH_INSTALL_TYPE="$(TORCH_INSTALL_TYPE)") \ @@ -187,16 +191,16 @@ jenkins-aarch64_%: STAGE = tritondevel jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_VERSION}),PY312,$(if $(findstring 3.10,${PYTHON_VERSION}),PY310,$(error Unknown PYTHON_VERSION specified))) jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE) jenkins-rockylinux8_%: STAGE = tritondevel -jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda +jenkins-rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda jenkins-rockylinux8_%: BASE_TAG = 12.9.1-devel-rockylinux8 rockylinux8_%: STAGE = tritondevel -rockylinux8_%: BASE_IMAGE = nvidia/cuda +rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda rockylinux8_%: BASE_TAG = 12.9.1-devel-rockylinux8 # For x86_64 and aarch64 ubuntu22_%: STAGE = tritondevel -ubuntu22_%: BASE_IMAGE = nvidia/cuda +ubuntu22_%: BASE_IMAGE = nvcr.io/nvidia/cuda ubuntu22_%: BASE_TAG = 12.9.1-devel-ubuntu22.04 trtllm_%: STAGE = release diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh index 6d118b62c45..8caf9896901 100644 --- a/docker/common/install_tensorrt.sh +++ b/docker/common/install_tensorrt.sh @@ -12,9 +12,7 @@ CUDNN_VER="9.10.2.21-1" # NGC PyTorch 25.06 image uses NCCL 2.27.3, while NCCL 2.27.5 resolves a perf regression issue. # Use NCCL version 2.27.5 instead. NCCL_VER="2.27.5-1+cuda12.9" -# NGC PyTorch 25.06 image uses cuBLAS 12.9.1.4, but which leads to failures with MoE Lora (see https://nvbugs/5376270). -# Continue using cuBLAS 12.9.0.13 until this issue is resolved. -CUBLAS_VER="12.9.0.13-1" +CUBLAS_VER="12.9.1.4-1" # Align with the pre-installed CUDA / NVCC / NVRTC versions from # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html NVRTC_VER="12.9.86-1" diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index 05d18284a06..d3a115ef14b 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -412,9 +412,10 @@ Generally, you should make sure that `max_batch_size` is not too low to bottlene For more details on `max_batch_size` and `max_num_tokens`, refer to [Tuning Max Batch Size and Max Num Tokens](../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.md). -### Not supported: MLA chunked context support on Hopper +### MLA chunked context + +MLA currently supports the chunked context feature on both Hopper and Blackwell GPUs. You can use `--enable_chunked_context` to enable it. This feature is primarily designed to reduce TPOT (Time Per Output Token). The default chunk size is set to `max_num_tokens`. If you want to achieve a lower TPOT, you can appropriately reduce the chunk size. However, please note that this will also decrease overall throughput. Therefore, a trade-off needs to be considered. -MLA chunked context support has been added on Blackwell GPUs, while it's not supported on Hopper yet. On Hopper, note that `max_num_tokens` has to be at least larger than the max input sequence length of the samples in dataset. For more details on `max_num_tokens`, refer to [Tuning Max Batch Size and Max Num Tokens](../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.md). ### Out of memory issues diff --git a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py index 65c2f5a644c..61240b496de 100644 --- a/examples/llm-api/quickstart_advanced.py +++ b/examples/llm-api/quickstart_advanced.py @@ -65,9 +65,9 @@ def add_llm_args(parser): parser.add_argument('--attention_dp_batching_wait_iters', type=int, default=0) - parser.add_argument('--use_torch_sampler', - default=False, - action='store_true') + parser.add_argument('--sampler_type', + default="auto", + choices=["auto", "TorchSampler", "TRTLLMSampler"]) parser.add_argument('--tp_size', type=int, default=1) parser.add_argument('--pp_size', type=int, default=1) parser.add_argument('--moe_ep_size', type=int, default=-1) @@ -108,6 +108,9 @@ def add_llm_args(parser): default=False, action='store_true', help='Use piecewise CUDA graph to optimize the model') + parser.add_argument('--apply_chat_template', + default=False, + action='store_true') # Sampling parser.add_argument("--max_tokens", type=int, default=64) @@ -227,7 +230,7 @@ def setup_llm(args, **kwargs): args.use_piecewise_cuda_graph) if args.use_torch_compile else None, moe_config=MoeConfig(backend=args.moe_backend), - use_torch_sampler=args.use_torch_sampler, + sampler_type=args.sampler_type, max_seq_len=args.max_seq_len, max_batch_size=args.max_batch_size, max_num_tokens=args.max_num_tokens, @@ -273,6 +276,15 @@ def main(): prompts = args.prompt if args.prompt else example_prompts llm, sampling_params = setup_llm(args) + new_prompts = [] + if args.apply_chat_template: + for prompt in prompts: + messages = [{"role": "user", "content": f"{prompt}"}] + new_prompts.append( + llm.tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True)) + prompts = new_prompts outputs = llm.generate(prompts, sampling_params) for i, output in enumerate(outputs): diff --git a/examples/llm-api/star_attention.py b/examples/llm-api/star_attention.py index e6071054fe4..367f7cc8435 100644 --- a/examples/llm-api/star_attention.py +++ b/examples/llm-api/star_attention.py @@ -7,6 +7,7 @@ import torch from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm.mapping import CpType from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig @@ -59,7 +60,7 @@ def generate_llm_outputs(args, data, fp8=False, fp8_kv_cache=False): kv_cache_quant_algo=QuantAlgo.FP8 if fp8_kv_cache else None) if fp8 else QuantConfig() cp_config = { - "cp_type": "star_attention", + "cp_type": CpType.STAR, "cp_anchor_size": args.sa_anchor_size, "block_size": args.sa_block_size } diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md index 2efe14b986d..b15d0785190 100644 --- a/examples/models/core/deepseek_v3/README.md +++ b/examples/models/core/deepseek_v3/README.md @@ -786,7 +786,7 @@ The converted checkpoint could be used as `` and consumed by oth KV cache reuse is supported for MLA on SM90 and SM100. It is enabled by default. Due to extra operations like memcpy and GEMMs, GPU memory consumption may be higher and the E2E performance may have regression in some cases. Users could pass `KvCacheConfig(enable_block_reuse=False)` to LLM API to disable it. ### Chunked Prefill -Chunked Prefill is supported for MLA only on SM100 currently. You should add `--enable_chunked_prefill` to enable it. The GPU memory consumption is highly correlated with `max_num_tokens` and `max_batch_size`. If encountering out-of-memory errors, you may make these values smaller. (`max_num_tokens` must be divisible by kv cache's `tokens_per_block`) +Chunked Prefill is supported for MLA only on SM90 and SM100 currently. You should add `--enable_chunked_prefill` to enable it. The GPU memory consumption is highly correlated with `max_num_tokens` and `max_batch_size`. If encountering out-of-memory errors, you may make these values smaller. (`max_num_tokens` must be divisible by kv cache's `tokens_per_block`) More specifically, we can imitate what we did in the [Quick Start](#quick-start): diff --git a/examples/models/core/gpt_oss/README.md b/examples/models/core/gpt_oss/README.md index bb21a6ff9a3..cda0086efee 100644 --- a/examples/models/core/gpt_oss/README.md +++ b/examples/models/core/gpt_oss/README.md @@ -6,12 +6,16 @@ GPT-OSS is a reasoning model with MoE weights quantized with mxfp4. All the othe ## MoE Support Matrix -In MoE, the weights are pre-quantized to mxfp4. The activation can be in either bf16 (Hopper) or mxfp8 (Blackwell), with similar accuracy. - -| device | Activation | Weight | Supported moe_backend | -|----------|----------|----------|----------| -| Hopper | bf16 | mxfp4 | **TRITON**, CUTLASS | -| Blackwell | mxfp8 | mxfp4 | CUTLASS, TRTLLM | +In MoE, the weights are pre-quantized to mxfp4. The activation can be in either bf16 (Hopper) or mxfp8 (Blackwell), with similar accuracy. FP8 activation with per-tensor scaling factor has limited support. Note that the per-tensor scaling factor needs to be calculated dynamically during inference with the official mxfp4 checkpoints, which may negatively impact perf. The configs in **bold** are the recommended configs for the official checkpoints. + +| device | Activation | Weight | Supported moe_backend | MMA| +|----------|----------|----------|----------|----------| +| Hopper | **bf16** | mxfp4 | **TRITON**, CUTLASS | simulated mxfp4, HGMMA | +| Hopper | fp8 | mxfp4 | CUTLASS (not enabled) | simulated mxfp4, QGMMA | +| Blackwell | **mxfp8** | mxfp4 | **CUTLASS, TRTLLM** | UTCQMMA | +| Blackwell | fp8 | mxfp4 | CUTLASS, TRTLLM | UTCQMMA | +| Blackwell | fp8 | mxfp4 | TRITON (experimental) | NA | +| Blackwell | bf16 | mxfp4 | TRTLLM | simulated mxfp4, UTCHMMA | | moe_backend | TP | EP | AlltoAll | diff --git a/examples/wide_ep/slurm_scripts/submit.sh b/examples/wide_ep/slurm_scripts/submit.sh index b34077a8315..f5b887f812c 100644 --- a/examples/wide_ep/slurm_scripts/submit.sh +++ b/examples/wide_ep/slurm_scripts/submit.sh @@ -12,7 +12,7 @@ model_dir= # Path to the model checkpoint repo_dir= # Path to the repo to install TensorRT-LLM, if this is empty, the pre-installed version will be used mtp_size=0 -ntasks_per_node=4 # 4 GPUs per GB200 node +ntasks_per_node=4 # 4 GPUs per GB200 node, 8 GPUs per B200 node isl=1024 osl=1024 @@ -23,8 +23,9 @@ streaming=true for b in 1 64 1024; do for eplb_num_slots in 0 256 288; do concurrency=$((b * 16)) - ctx_num=$(((concurrency + 5499)/5500)) - total_node_num=$((ctx_num + 4)) + ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200 + ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200 + total_node_num=$((ctx_node_num + 4)) # $((ctx_node_num + 2)) for B200 ntasks=$((total_node_num * ntasks_per_node)) args=( @@ -58,8 +59,9 @@ done # dep32 eplb288 for b in 512; do concurrency=$((b * 32)) - ctx_num=$(((concurrency + 5499)/5500)) - total_node_num=$((ctx_num + 8)) + ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200 + ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200 + total_node_num=$((ctx_node_num + 8)) # $((ctx_node_num + 4)) for B200 ntasks=$((total_node_num * ntasks_per_node)) eplb_num_slots=288 diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy index 5aa61708f5b..64e03de476a 100644 --- a/jenkins/BuildDockerImage.groovy +++ b/jenkins/BuildDockerImage.groovy @@ -258,7 +258,7 @@ def buildImage(config, imageKeyToTag) // Step 2: Build the images stage ("Install packages") { sh "pwd && ls -alh" - sh "env" + sh "env | sort" sh "apk add make git" sh "git config --global --add safe.directory '*'" @@ -281,23 +281,31 @@ def buildImage(config, imageKeyToTag) try { def build_jobs = BUILD_JOBS // Fix the triton image pull timeout issue - def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() - def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep 'ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() + def BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG BASE_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() + def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() + def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() + + if (target == "rockylinux8") { + BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^jenkins-rockylinux8_%: BASE_IMAGE =' docker/Makefile | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim() + } + + // Replace the base image and triton image with the internal mirror + BASE_IMAGE = BASE_IMAGE.replace("nvcr.io/", "urm.nvidia.com/docker/") + TRITON_IMAGE = TRITON_IMAGE.replace("nvcr.io/", "urm.nvidia.com/docker/") if (dependent) { stage ("make ${dependent.target}_${action} (${arch})") { - retry(3) { - sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}" - } - retry(3) { - sh """ - cd ${LLM_ROOT} && make -C docker ${dependent.target}_${action} \ - TORCH_INSTALL_TYPE=${torchInstallType} \ - IMAGE_WITH_TAG=${dependentImageWithTag} \ - STAGE=${dependent.dockerfileStage} \ - BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} - """ - } + def randomSleep = (Math.random() * 300 + 300).toInteger() + trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, shortCommondRunTimeMax: 7200) + trtllm_utils.llmExecStepWithRetry(this, script: """ + cd ${LLM_ROOT} && make -C docker ${dependent.target}_${action} \ + BASE_IMAGE=${BASE_IMAGE} \ + TRITON_IMAGE=${TRITON_IMAGE} \ + TORCH_INSTALL_TYPE=${torchInstallType} \ + IMAGE_WITH_TAG=${dependentImageWithTag} \ + STAGE=${dependent.dockerfileStage} \ + BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} + """, sleepInSecs: randomSleep, numRetries: 3, shortCommondRunTimeMax: 7200) args += " DEVEL_IMAGE=${dependentImageWithTag}" if (target == "ngc-release") { imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag @@ -315,18 +323,18 @@ def buildImage(config, imageKeyToTag) } } stage ("make ${target}_${action} (${arch})") { - retry(3) { - sh "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}" - } - retry(3) { - sh """ - cd ${LLM_ROOT} && make -C docker ${target}_${action} \ - TORCH_INSTALL_TYPE=${torchInstallType} \ - IMAGE_WITH_TAG=${imageWithTag} \ - STAGE=${dockerfileStage} \ - BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} - """ - } + sh "env | sort" + def randomSleep = (Math.random() * 300 + 300).toInteger() + trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, shortCommondRunTimeMax: 7200) + trtllm_utils.llmExecStepWithRetry(this, script: """ + cd ${LLM_ROOT} && make -C docker ${target}_${action} \ + BASE_IMAGE=${BASE_IMAGE} \ + TRITON_IMAGE=${TRITON_IMAGE} \ + TORCH_INSTALL_TYPE=${torchInstallType} \ + IMAGE_WITH_TAG=${imageWithTag} \ + STAGE=${dockerfileStage} \ + BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} + """, sleepInSecs: randomSleep, numRetries: 3, shortCommondRunTimeMax: 7200) if (target == "ngc-release") { imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag } @@ -336,6 +344,8 @@ def buildImage(config, imageKeyToTag) stage ("custom tag: ${customTag} (${arch})") { sh """ cd ${LLM_ROOT} && make -C docker ${target}_${action} \ + BASE_IMAGE=${BASE_IMAGE} \ + TRITON_IMAGE=${TRITON_IMAGE} \ TORCH_INSTALL_TYPE=${torchInstallType} \ IMAGE_WITH_TAG=${customImageWithTag} \ STAGE=${dockerfileStage} \ diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index b40c7a11a7e..3ed53788815 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310 LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312 // DLFW torch image -DLFW_IMAGE = "nvcr.io/nvidia/pytorch:25.06-py3" +DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.06-py3" //Ubuntu base image UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04" @@ -2075,6 +2075,11 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) checkPipStage = true } + if (cpu_arch == AARCH64_TRIPLE && values[5] != DLFW_IMAGE) { + checkPipStage = false + echo "Skip pip install sanity check due to https://nvbugs/5453827" + } + if (checkPipStage) { stage("Run LLMAPI tests") { pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch) diff --git a/jenkins/controlCCache.groovy b/jenkins/controlCCache.groovy index 82fa7757ad0..bc34d88e4d0 100644 --- a/jenkins/controlCCache.groovy +++ b/jenkins/controlCCache.groovy @@ -1,7 +1,7 @@ import java.lang.InterruptedException -DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507251001-5678" +DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501" def createKubernetesPodConfig(image, arch = "amd64") { diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index dee2ee7218f..751f2516358 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -11,7 +11,7 @@ # # NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508051130-6090 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508051130-6090 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508051130-6090 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508051130-6090 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508130930-6501 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508130930-6501 diff --git a/pyproject.toml b/pyproject.toml index edc6fbcf8a1..3ee28a7e166 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ ############################### BUILD CONFIGURATION ############################################## #################################################################################################### [build-system] -requires = ["setuptools >= 64"] +requires = ["setuptools >= 64", "pip >= 24"] build-backend = "setuptools.build_meta" #################################################################################################### diff --git a/setup.py b/setup.py index d3293c4beed..b16d9ce4704 100644 --- a/setup.py +++ b/setup.py @@ -120,14 +120,14 @@ def has_ext_modules(self): ] -def download_precompiled(workspace: str) -> str: +def download_precompiled(workspace: str, version: str) -> str: import glob import subprocess from setuptools.errors import SetupError cmd = [ - "pip", "download", f"tensorrt_llm={get_version()}", + "python3", "-m", "pip", "download", f"tensorrt_llm=={version}", f"--dest={workspace}", "--no-deps", "--extra-index-url=https://pypi.nvidia.com" ] @@ -201,17 +201,18 @@ def extract_from_precompiled(precompiled_location: str, package_data: List[str], wheel.extract(file) -use_precompiled: bool = os.getenv("TRTLLM_USE_PRECOMPILED") == "1" -precompiled_location: str = os.getenv("TRTLLM_PRECOMPILED_LOCATION") - -if precompiled_location: - use_precompiled = True +precompiled: str | None = os.getenv("TRTLLM_USE_PRECOMPILED") +precompiled_location: str | None = os.getenv("TRTLLM_PRECOMPILED_LOCATION") +use_precompiled: bool = (precompiled is not None + and precompiled != "0") or (precompiled_location + is not None) if use_precompiled: from tempfile import TemporaryDirectory with TemporaryDirectory() as tempdir: if not precompiled_location: - precompiled_location = download_precompiled(tempdir) + version = precompiled if precompiled != "1" else get_version() + precompiled_location = download_precompiled(tempdir, version) extract_from_precompiled(precompiled_location, package_data, tempdir) sanity_check() diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index 4173b338c22..8bd4c49bbc9 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ b/tensorrt_llm/_torch/attention_backend/interface.py @@ -342,6 +342,7 @@ def __call__(self, position_ids: torch.Tensor, q: torch.Tensor, class RopeParams: dim: int = 0 theta: float = 10000.0 + alpha: float = 1.0 scale_type: RotaryScalingType = RotaryScalingType.none scale: float = 1.0 low_freq_factor: float = 1.0 @@ -384,6 +385,7 @@ def from_config(config) -> "RopeParams": rope_params.scale_type = RotaryScalingType.none rope_params.scale = 1.0 if rope_scaling is not None: + rope_params.alpha = rope_scaling.get("alpha", 1.0) rotary_scaling_type = rope_scaling.get( "type", None) or rope_scaling.get("rope_type") rope_params.scale_type = RotaryScalingType.from_string( @@ -462,6 +464,7 @@ def create_rope_const_params(self, interleave: bool = True): self.scale_type, rope_scaling_config={ "factor": self.scale, + "alpha": self.alpha, "low_freq_factor": self.low_freq_factor, "high_freq_factor": self.high_freq_factor, "original_max_position_embeddings": diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py index c49d7806ee2..74ac9590a38 100644 --- a/tensorrt_llm/_torch/distributed/ops.py +++ b/tensorrt_llm/_torch/distributed/ops.py @@ -455,8 +455,7 @@ def __init__(self, self.workspace = get_allreduce_workspace(self.mapping) # Initialize MNNVL AllReduce if needed - if self.strategy in (AllReduceStrategy.AUTO, - AllReduceStrategy.MNNVL): + if self.strategy == AllReduceStrategy.MNNVL: if MNNVLAllReduce.is_mnnvl(self.mapping, dtype): try: self.mnnvl_allreduce = MNNVLAllReduce( diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py index 5bc9e7870f4..7e310f934ac 100644 --- a/tensorrt_llm/_torch/model_config.py +++ b/tensorrt_llm/_torch/model_config.py @@ -143,6 +143,14 @@ def get_all_reduce_strategy(strategy: str = "AUTO"): self.allreduce_strategy = get_all_reduce_strategy( self.allreduce_strategy) + @property + def torch_dtype(self) -> torch.dtype: + """Get the torch dtype of the model.""" + # TODO: this is an assumption that a HF model is always in bfloat16 + # We should figure out a better way to handle this if other models + # start to not report dtype. + return self.pretrained_config.torch_dtype or torch.bfloat16 + @property def fuse_pos_embd(self): if self.attn_backend == 'TRTLLM': @@ -154,8 +162,9 @@ def fuse_pos_embd(self): @property def enable_flash_mla(self): if self.attn_backend == 'TRTLLM': - if hasattr(self.pretrained_config, "kv_lora_rank") and hasattr( - self.pretrained_config, "qk_rope_head_dim"): + if getattr(self.pretrained_config, + "kv_lora_rank", None) and getattr( + self.pretrained_config, "qk_rope_head_dim", None): head_dim = self.pretrained_config.kv_lora_rank + self.pretrained_config.qk_rope_head_dim if head_dim == 576 and torch.cuda.get_device_capability() == ( 9, 0): diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py index 6d6b12d06ab..4f7aa39330e 100644 --- a/tensorrt_llm/_torch/models/__init__.py +++ b/tensorrt_llm/_torch/models/__init__.py @@ -8,6 +8,7 @@ from .modeling_gemma3 import Gemma3ForCausalLM from .modeling_gemma3vl import Gemma3VLM from .modeling_gpt_oss import GptOssForCausalLM +from .modeling_hunyuan_moe import HunYuanMoEV1ForCausalLM from .modeling_hyperclovax import HCXVisionForCausalLM from .modeling_llama import LlamaForCausalLM from .modeling_llava_next import LlavaNextModel @@ -38,6 +39,7 @@ "Gemma3ForCausalLM", "Gemma3VLM", "HCXVisionForCausalLM", + "HunYuanMoEV1ForCausalLM", "LlamaForCausalLM", "LlavaNextModel", "Mistral3VLM", diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index a826a8e993d..5210d341d46 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -53,7 +53,6 @@ from ..distributed import (AllReduce, AllReduceFusionOp, AllReduceParams, MoEAllReduce, MoEAllReduceParams, allgather) from ..model_config import ModelConfig -from ..models.modeling_utils import ModelConfig, QuantConfig from ..modules.attention import MLA from ..modules.decoder_layer import DecoderLayer from ..modules.embedding import Embedding @@ -66,10 +65,10 @@ from ..modules.multi_stream_utils import maybe_execute_in_parallel from ..modules.rms_norm import RMSNorm from ..peft.lora.layer import LoraLayer -from ..speculative import MTPEagleWorker, MTPSpecMetadata, MTPWorker +from ..speculative import MTPSpecMetadata, SpecMetadata from ..utils import AuxStreamType, EventType, Fp4QuantizedTensor -from .modeling_utils import (DecoderModel, DecoderModelForCausalLM, - EagerFusionConfig, filter_weights, +from .modeling_speculative import SpecDecOneEngineForCausalLM +from .modeling_utils import (DecoderModel, EagerFusionConfig, filter_weights, register_auto_model) @@ -541,7 +540,8 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4, router_logits = self.gate(hidden_states) routed_output = self.experts( - hidden_states_fp4 or hidden_states, + hidden_states_fp4 + if hidden_states_fp4 is not None else hidden_states, router_logits, do_finalize=do_finalize, output_dtype=hidden_states.dtype, @@ -565,8 +565,9 @@ def forward( assert not self.use_dp def _compute_shared_output(): - shared_output = self.shared_experts(hidden_states_fp4 - or hidden_states) + shared_output = self.shared_experts( + hidden_states_fp4 + if hidden_states_fp4 is not None else hidden_states) if self.shared_output_scale is not None: shared_output *= self.shared_output_scale return shared_output @@ -750,7 +751,7 @@ def forward( attn_metadata: AttentionMetadata, residual: torch.Tensor, **kwargs, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -782,7 +783,7 @@ def forward_MoE( hidden_states: torch.Tensor, attn_metadata: AttentionMetadata, residual: torch.Tensor, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: def _run_MoE(hidden_states, hidden_states_fp4, do_finalize): return self.mlp( @@ -866,7 +867,7 @@ def forward_mlp( self, hidden_states: torch.Tensor, residual: torch.Tensor, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor, torch.Tensor]: if self.fusion_config.PRE_MLP_FUSION: act_fp4, act_sf, residual = self.allreduce( @@ -970,7 +971,7 @@ def forward( all_rank_num_tokens: Optional[List[int]] = None, all_rank_max_num_tokens: Optional[int] = None, **kwargs, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: def norm_embeds(): return self.enorm(embed_tokens(input_ids)) #emdedding @@ -1085,6 +1086,8 @@ def forward( input_ids: Optional[torch.IntTensor] = None, position_ids: Optional[torch.IntTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, + spec_metadata: Optional[SpecMetadata] = None, + **kwargs, ) -> torch.Tensor: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError( @@ -1109,8 +1112,8 @@ def forward( @register_auto_model("DeepseekV3ForCausalLM") -class DeepseekV3ForCausalLM(DecoderModelForCausalLM[DeepseekV3Model, - PretrainedConfig]): +class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model, + PretrainedConfig]): def __init__(self, model_config: ModelConfig[PretrainedConfig]): # Rename some keys of quant_config_dict to support legacy checkpoints @@ -1125,10 +1128,9 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]): model_config._frozen = False model_config.quant_config_dict = quant_config_dict model_config._frozen = True - super().__init__(DeepseekV3Model(model_config), - config=model_config, - hidden_size=model_config.pretrained_config.hidden_size, - vocab_size=model_config.pretrained_config.vocab_size) + + super().__init__(model=DeepseekV3Model(model_config), + model_config=model_config) self.model_nextn = 0 if model_config.spec_config is not None: @@ -1138,23 +1140,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]): assert ckpt_nextn > 0, "There is not MTP modules in the checkpoint." if ckpt_nextn == 1 and not model_config.spec_config.use_mtp_vanilla: moe_load_balancer_set_repeated_for_next_layer(model_nextn) - mtp_layer = DeepseekV3MTP(model_config, self.num_hidden_layers, - self.model.aux_stream_dict) - self.model.layers.append(mtp_layer) - self.epilogue.append(mtp_layer) - self.mtp_worker = MTPEagleWorker(model_config.spec_config, - model_config) else: - mtp_layers = nn.ModuleList([ - DeepseekV3MTP(model_config, - layer_idx + self.num_hidden_layers, - self.model.aux_stream_dict) - for layer_idx in range(model_nextn) - ]) - self.model.layers.extend(mtp_layers) - self.epilogue.extend(mtp_layers) - self.mtp_worker = MTPWorker(model_config.spec_config, - model_config) # modify the QuantConfig to support duplicated mtp layers if model_config.quant_config.exclude_modules is not None: extend_exclude_modules = [] @@ -1172,7 +1158,9 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]): ckpt_prefix, model_prefix)) self.model_config.quant_config.exclude_modules.extend( extend_exclude_modules) - self.epilogue.append(self.mtp_worker) + self.model.layers.extend(self.draft_model.mtp_layers) + self.epilogue.extend(self.draft_model.mtp_layers) + self.epilogue.append(self.spec_worker) def forward( self, @@ -1185,40 +1173,13 @@ def forward( **kwargs, ) -> torch.Tensor: attn_metadata.num_generations_per_batch = self.model_nextn + 1 - hidden_states = self.model( - input_ids=input_ids, - attn_metadata=attn_metadata, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - ) - - if spec_metadata and spec_metadata.spec_dec_mode.is_mtp(): - # get logits - logits = self.logits_processor.forward( - hidden_states[spec_metadata.gather_ids], - self.lm_head, - attn_metadata, - True, - ) - # get accepted tokens and next draft tokens - return self.mtp_worker( - input_ids=input_ids, - position_ids=position_ids, - hidden_states=hidden_states, - logits=logits, - lm_head=self.lm_head, - embed_tokens=self.model.embed_tokens, - attn_metadata=attn_metadata, - spec_metadata=spec_metadata, - mtp_layers=self.model.layers[self.num_hidden_layers:]) - else: - logits = self.logits_processor.forward( - hidden_states, - self.lm_head, - attn_metadata, - return_context_logits, - ) - return logits + return super().forward(attn_metadata=attn_metadata, + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + spec_metadata=spec_metadata, + return_context_logits=return_context_logits, + **kwargs) def load_weights(self, weights: Dict): diff --git a/tensorrt_llm/_torch/models/modeling_gpt_oss.py b/tensorrt_llm/_torch/models/modeling_gpt_oss.py index da4b54e09e0..5ea69fefb69 100644 --- a/tensorrt_llm/_torch/models/modeling_gpt_oss.py +++ b/tensorrt_llm/_torch/models/modeling_gpt_oss.py @@ -182,8 +182,8 @@ def __init__( # Perfect router caching - precompute common logits if enabled if os.environ.get('ENABLE_PERFECT_ROUTER', '0') == '1': precompute_common_perfect_router_logits( - num_experts=pretrained_config.num_experts, - experts_per_token=pretrained_config.experts_per_token, + num_experts=pretrained_config.num_local_experts, + experts_per_token=pretrained_config.num_experts_per_tok, moe_ep_size=config.mapping.moe_ep_size, dtype=pretrained_config.torch_dtype) diff --git a/tensorrt_llm/_torch/models/modeling_hunyuan_moe.py b/tensorrt_llm/_torch/models/modeling_hunyuan_moe.py new file mode 100644 index 00000000000..6ebb6f7e53d --- /dev/null +++ b/tensorrt_llm/_torch/models/modeling_hunyuan_moe.py @@ -0,0 +1,433 @@ +from typing import Dict, Optional, Union + +import torch +from torch import nn +from tqdm import tqdm +from transformers import PretrainedConfig + +from tensorrt_llm._torch.distributed import AllReduceParams +from tensorrt_llm.functional import PositionEmbeddingType + +from ..attention_backend import AttentionMetadata +from ..attention_backend.interface import (PositionalEmbeddingParams, + PredefinedAttentionMask, RopeParams) +from ..model_config import ModelConfig +from ..modules.attention import Attention +from ..modules.decoder_layer import DecoderLayer +from ..modules.embedding import Embedding +from ..modules.fused_moe import (CutlassFusedMoE, RenormalizeMoeRoutingMethod, + VanillaMoE, create_moe) +from ..modules.gated_mlp import GatedMLP +from ..modules.linear import Linear, TensorParallelMode +from ..modules.multi_stream_utils import maybe_execute_in_parallel +from ..modules.rms_norm import RMSNorm +from ..utils import AuxStreamType, Fp4QuantizedTensor +from .modeling_utils import (DecoderModel, DecoderModelForCausalLM, + duplicate_kv_weight, register_auto_model) + + +class HunyuanMoE(nn.Module): + + def __init__( + self, + model_config: ModelConfig[PretrainedConfig], + aux_stream: torch.cuda.Stream, + ): + super().__init__() + config = model_config.pretrained_config + self.hidden_dim = config.hidden_size + self.ffn_dim = config.intermediate_size + self.moe_intermediate_size = config.moe_intermediate_size[0] \ + if isinstance(config.moe_intermediate_size, list) else config.moe_intermediate_size + self.num_experts = config.num_experts + self.top_k = config.moe_topk[0] \ + if isinstance(config.moe_topk, list) else config.moe_topk + self.enable_attention_dp = model_config.mapping.enable_attention_dp + + # moe gate (linear layer) only runs in half/full precision for now + self.gate = Linear(self.hidden_dim, + self.num_experts, + bias=False, + dtype=config.torch_dtype) + + reduce_results = True + + self.experts = create_moe( + num_experts=self.num_experts, + routing_method=RenormalizeMoeRoutingMethod(top_k=self.top_k), + hidden_size=self.hidden_dim, + intermediate_size=self.moe_intermediate_size, + aux_stream=aux_stream, + dtype=config.torch_dtype, + reduce_results=reduce_results, + model_config=model_config) + + self.shared_mlp = GatedMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + bias=config.mlp_bias if hasattr(config, 'mlp_bias') else False, + dtype=config.torch_dtype, + config=model_config, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + assert hidden_states.shape[-1] == self.hidden_dim + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(-1, self.hidden_dim) + + shared_expert_output = self.shared_mlp(hidden_states) + all_rank_num_tokens = attn_metadata.all_rank_num_tokens + router_logits = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states, + router_logits, + all_rank_num_tokens=all_rank_num_tokens, + use_dp_padding=False) + + final_hidden_states = shared_expert_output + final_hidden_states + + return final_hidden_states.view(orig_shape) + + +class HunYuanAttention(Attention): + + def __init__( + self, + model_config: ModelConfig[PretrainedConfig], + layer_idx: Optional[int] = None, + use_qk_norm: bool = True, + nope_layer: bool = False, + aux_stream: Optional[torch.cuda.Stream] = None, + ): + config = model_config.pretrained_config + + self.use_rope = not nope_layer + pos_embd_params = PositionalEmbeddingParams( + type=PositionEmbeddingType.rope_gpt_neox, + rope=RopeParams.from_config(config), + is_neox=True, + ) if self.use_rope else None + self.use_qk_norm = use_qk_norm + + super().__init__( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + num_key_value_heads=config.num_key_value_heads, + max_position_embeddings=config.max_position_embeddings, + bias=config.attention_bias, + pos_embd_params=pos_embd_params, + rope_fusion=not self.use_qk_norm, + layer_idx=layer_idx, + dtype=config.torch_dtype, + config=model_config, + ) + + self.head_dim = config.hidden_size // config.num_attention_heads + self.query_layernorm = RMSNorm(hidden_size=self.head_dim, + eps=config.rms_norm_eps, + dtype=config.torch_dtype) + self.key_layernorm = RMSNorm(hidden_size=self.head_dim, + eps=config.rms_norm_eps, + dtype=config.torch_dtype) + self.aux_stream = aux_stream + self.ln_events = [torch.cuda.Event(), torch.cuda.Event()] + + def apply_rope(self, q: torch.Tensor, k: Optional[torch.Tensor], + v: Optional[torch.Tensor], position_ids: torch.Tensor): + q, k, v = self.split_qkv(q, k, v) + if position_ids is not None: + q, k, v = super().apply_rope(q, k, v, position_ids) + # Llama4 applies QK norm after RoPE. + if self.use_qk_norm: + q, k = self.apply_qk_norm(q, k) + + return q, k, v + + def apply_qk_norm(self, q, k): + + def q_l2norm(): + return self.query_layernorm(q.reshape(-1, self.head_dim)).reshape( + -1, self.q_size) + + def k_l2norm(): + return self.key_layernorm(k.reshape(-1, self.head_dim)).reshape( + -1, self.kv_size) + + q, k = maybe_execute_in_parallel( + q_l2norm, + k_l2norm, + self.ln_events[0], + self.ln_events[1], + self.aux_stream, + ) + + return q, k + + def forward( + self, + position_ids: Optional[torch.IntTensor], + hidden_states: Union[torch.Tensor, Fp4QuantizedTensor], + attn_metadata: AttentionMetadata, + attention_mask: PredefinedAttentionMask = PredefinedAttentionMask. + CAUSAL, + mrope_config: Optional[dict] = None, + all_reduce_params: Optional[AllReduceParams] = None, + lora_params: Optional[dict] = None, + **kwargs, + ) -> torch.Tensor: + assert lora_params is None, "LORA is not supported for HunYuanAttention" + return super().forward( + position_ids=position_ids, + hidden_states=hidden_states, + attn_metadata=attn_metadata, + attention_mask=attention_mask, + mrope_config=mrope_config, + all_reduce_params=all_reduce_params, + lora_params=lora_params, + **kwargs, + ) + + +class HunYuanDecoderLayer(DecoderLayer): + + def __init__(self, model_config: ModelConfig[PretrainedConfig], + layer_idx: int, aux_stream_dict: Dict[AuxStreamType, + torch.cuda.Stream]): + super().__init__() + config = model_config.pretrained_config + self.layer_idx = layer_idx + + # attention + self.self_attn = HunYuanAttention( + model_config, + layer_idx=layer_idx, + ) + + is_experts_valid = ((isinstance(config.num_experts, int) + and config.num_experts > 1) + or (isinstance(config.num_experts, list) + and max(config.num_experts) > 1)) + is_moe_single_node = is_experts_valid and layer_idx >= config.moe_layer_num_skipped # only support one node yet + + if is_moe_single_node: + self.mlp = HunyuanMoE( + model_config, aux_stream_dict[AuxStreamType.MoeChunkingOverlap]) + else: + self.mlp = GatedMLP(hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + bias=config.mlp_bias, + dtype=config.torch_dtype, + config=model_config) + + norm_type = getattr(config, 'norm_type', 'rms') + if norm_type == 'hf_rms' or norm_type == 'rms': + self.input_layernorm = RMSNorm(hidden_size=config.hidden_size, + eps=config.rms_norm_eps, + dtype=config.torch_dtype) + self.post_attention_layernorm = RMSNorm( + hidden_size=config.hidden_size, + eps=config.rms_norm_eps, + dtype=config.torch_dtype) + elif norm_type == 'fused' or norm_type == 'torch_nn': + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = nn.LayerNorm( + config.hidden_size, eps=config.rms_norm_eps) + else: + assert False, "other norm_type are not supported" + + def forward( + self, + position_ids: torch.LongTensor, + hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states = self.self_attn( + position_ids=position_ids, + hidden_states=hidden_states, + attn_metadata=attn_metadata, + **kwargs, + ) + # Fully Connected + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states, attn_metadata) + hidden_states = residual + hidden_states + return hidden_states + + +class HunYuanModel(DecoderModel): + + def __init__(self, model_config: ModelConfig[PretrainedConfig]): + super().__init__(model_config) + config = model_config.pretrained_config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.num_hidden_layers = config.num_hidden_layers + self.aux_stream_dict = { + key: torch.cuda.Stream() + for key in [ + AuxStreamType.Attention, AuxStreamType.MoeShared, + AuxStreamType.MoeChunkingOverlap + ] + } + + self.embed_tokens = Embedding( + config.vocab_size, + config.hidden_size, + dtype=config.torch_dtype, + mapping=model_config.mapping, + tensor_parallel_mode=TensorParallelMode.COLUMN, + gather_output=True, + ) + + self.layers = nn.ModuleList([ + HunYuanDecoderLayer(model_config, layer_idx, self.aux_stream_dict) + for layer_idx in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(hidden_size=config.hidden_size, + eps=config.rms_norm_eps, + dtype=config.torch_dtype) + + def forward( + self, + attn_metadata: AttentionMetadata, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + **kwargs, + ) -> torch.Tensor: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + hidden_states = inputs_embeds + + for layer_idx, decoder_layer in enumerate(self.layers): + kwargs['layer_idx'] = layer_idx + hidden_states = decoder_layer( + position_ids=position_ids, + hidden_states=hidden_states, + attn_metadata=attn_metadata, + **kwargs, + ) + hidden_states = self.norm(hidden_states) + return hidden_states + + +@register_auto_model("HunYuanMoEV1ForCausalLM") +class HunYuanMoEV1ForCausalLM(DecoderModelForCausalLM[HunYuanModel, + PretrainedConfig]): + + def __init__(self, model_config: ModelConfig[PretrainedConfig]): + super().__init__(HunYuanModel(model_config), + config=model_config, + hidden_size=model_config.pretrained_config.hidden_size, + vocab_size=model_config.pretrained_config.vocab_size) + self._execution_stats = None + print("---debug model_config: ", model_config) + + def load_weights(self, weights: Dict): + tp_size = self.model_config.mapping.tp_size + head_dim = self.config.hidden_size // self.config.num_attention_heads + + def filter_weights(prefix, weights: Dict): + result = {} + for k, v in weights.items(): + if k.startswith(prefix): + new_k = k[len(prefix) + 1:] + result[new_k] = v + return result + + params_map = { + 'qkv_proj': ['q_proj', 'k_proj', 'v_proj'], + 'gate_up_proj': ['gate_proj', 'up_proj'] + } + for name, module in tqdm(list(self.named_modules()), + desc="Loading weights"): + if len(module._parameters) > 0: + # skip load weights if tie word embeddings is enabled and layer is lm_head + if self.config.tie_word_embeddings and name.startswith( + "lm_head"): + continue + names = name.split('.') + if names[-1] in params_map: + # model.layers.{idx}.mlp.shared_mlp.gate_up_proj or model.layers.{idx}.self_attn.qkv_proj + module_weights = [] + for new_name in params_map[names[-1]]: + fw = filter_weights('.'.join(names[:-1] + [new_name]), + weights) + if new_name in ['k_proj', 'v_proj']: + fw = { + k: + duplicate_kv_weight( + weight=v[:], + num_kv_heads=v[:].shape[0] // head_dim, + tensor_parallel_size=tp_size) + if k in ["weight", "bias"] else v + for k, v in fw.items() + } + module_weights.append(fw) + module.load_weights(weights=module_weights) + else: + name = name.replace('gate', 'gate.wg') + module_weights = filter_weights(name, weights) + if isinstance(module, CutlassFusedMoE) or isinstance( + module, VanillaMoE): + # model.layers.{idx}.mlp.experts + updated_module_weights = {} + for weight_name, weight_value in module_weights.items(): + new_weight_name = weight_name.replace( + "gate_proj", + "w1").replace("up_proj", + "w3").replace("down_proj", "w2") + updated_module_weights[ + new_weight_name] = weight_value + del module_weights + module.load_weights(weights=[updated_module_weights]) + elif hasattr(module, 'load_weights'): + # model.layers.{idx}.self_attn.o_proj or model.layers.{idx}.mlp.shared_mlp.down_proj + # or model.layers.{idx}.mlp.experts.gate + module.load_weights(weights=[module_weights]) + else: + for n, p in module._parameters.items(): + if p is not None: + p.data.copy_(module_weights[n][:]) + + def forward( + self, + attn_metadata: AttentionMetadata, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + return_context_logits: bool = False, + **kwargs, + ) -> torch.Tensor: + output = self.model( + input_ids=input_ids, + attn_metadata=attn_metadata, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + ) + + return self.logits_processor.forward( + output, + self.lm_head, + attn_metadata, + return_context_logits, + ) diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py index e8a57742115..682c55919e0 100644 --- a/tensorrt_llm/_torch/models/modeling_speculative.py +++ b/tensorrt_llm/_torch/models/modeling_speculative.py @@ -2,7 +2,7 @@ import torch from torch import nn -from transformers import LlamaConfig +from transformers import LlamaConfig, PretrainedConfig from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \ BaseWeightMapper @@ -320,14 +320,45 @@ def apply_eagle3_fc(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -def get_draft_model(model_config, draft_config): +class MTPForCausalLM(nn.Module): + + def __init__( + self, + model_config: ModelConfig[PretrainedConfig], + start_layer_idx: int = 0, + lm_head: nn.Module = None, + model: nn.Module = None, + ): + super().__init__() + # Import here to avoid circular import + from .modeling_deepseekv3 import DeepseekV3MTP + + spec_dec_mode = model_config.spec_config.spec_dec_mode + assert spec_dec_mode.is_mtp() + mtp_num_layers = 1 if spec_dec_mode.is_mtp_eagle( + ) else model_config.spec_config.num_nextn_predict_layers + + self.mtp_layers = nn.ModuleList([ + DeepseekV3MTP(model_config, layer_idx + start_layer_idx, + model.aux_stream_dict) + for layer_idx in range(mtp_num_layers) + ]) + self.lm_head = lm_head + self.embed_tokens = model.embed_tokens + + +def get_draft_model(model_config, draft_config, lm_head, model): assert getattr(model_config, 'spec_config', None) != None spec_dec_mode = model_config.spec_config.spec_dec_mode if spec_dec_mode.is_eagle3_one_model(): return Eagle3ForCausalLM( draft_config, model_config.pretrained_config.num_hidden_layers) + elif spec_dec_mode.is_mtp(): + return MTPForCausalLM(model_config, + model_config.pretrained_config.num_hidden_layers, + lm_head, model) else: - raise NotImplemented( + raise NotImplementedError( f"get_draft_model does not support speculative decoding mode {spec_dec_mode}." ) @@ -341,23 +372,24 @@ def __init__(self, model: TModel, model_config: ModelConfig[TConfig]): hidden_size=model_config.pretrained_config.hidden_size, vocab_size=model_config.pretrained_config.vocab_size) self.draft_model = None - if getattr( - model_config, 'spec_config', None - ) and model_config.spec_config.spec_dec_mode.use_one_engine(): - draft_config = ModelConfig.from_pretrained( - model_config.spec_config.speculative_model_dir, - trust_remote_code=True, - attn_backend=model_config.attn_backend, - moe_backend=model_config.moe_backend, - mapping=model_config.mapping, - spec_config=model_config.spec_config, - max_num_tokens=model_config.max_num_tokens, - moe_max_num_tokens=model_config.moe_max_num_tokens) - - draft_config.quant_config.kv_cache_quant_algo = \ + spec_config = getattr(model_config, 'spec_config', None) + if spec_config and spec_config.spec_dec_mode.use_one_engine(): + draft_config = None + if spec_config.spec_dec_mode.is_eagle3_one_model(): + draft_config = ModelConfig.from_pretrained( + model_config.spec_config.speculative_model_dir, + trust_remote_code=True, + attn_backend=model_config.attn_backend, + moe_backend=model_config.moe_backend, + mapping=model_config.mapping, + spec_config=model_config.spec_config, + max_num_tokens=model_config.max_num_tokens, + moe_max_num_tokens=model_config.moe_max_num_tokens) + draft_config.quant_config.kv_cache_quant_algo = \ model_config.quant_config.kv_cache_quant_algo - self.draft_model = get_draft_model(model_config, draft_config) + self.draft_model = get_draft_model(model_config, draft_config, + self.lm_head, self.model) self.spec_worker = get_spec_worker(model_config.spec_config, model_config, model_config.mapping) diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index bfa20eff407..cdba3cd1b0d 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -499,6 +499,11 @@ def apply_rope(self, q: torch.Tensor, k: Optional[torch.Tensor], q, k = self.rotary_emb(position_ids, [q, k]) return q, k, v + def apply_qk_norm(self, q, k): + raise NotImplementedError( + f"QK norm is not implemented for {self.__class__.__name__}." + "Please override the `apply_qk_norm` method in the subclass.") + @torch.library.custom_op("trtllm::mla_custom_op_inplace", mutates_args=("output", )) diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py index 2e257c306ae..00a1c494d2a 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py @@ -185,6 +185,8 @@ def has_w4afp8(self): @cached_property def enable_alltoall(self): return (self.mapping.moe_ep_size > self.routing_method.experts_per_token + and self.routing_method.experts_per_token % 4 == + 0 # alltoall without allgather only supports top_k % 4 == 0 and self.mapping.enable_attention_dp and self.mapping.tp_size > 1 and os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") != "1" diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py index 7c9fc125e7d..480aba79bf9 100644 --- a/tensorrt_llm/_torch/pyexecutor/_util.py +++ b/tensorrt_llm/_torch/pyexecutor/_util.py @@ -8,15 +8,14 @@ import tensorrt_llm import tensorrt_llm.bindings.executor as trtllm from tensorrt_llm._torch.model_config import ModelConfig -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm._utils import str_dtype_to_binding, torch_dtype_to_str from tensorrt_llm.bindings.executor import DecodingMode, ExecutorConfig -from tensorrt_llm.llmapi.llm_args import PeftCacheConfig +from tensorrt_llm.llmapi.llm_args import PeftCacheConfig, SamplerType from tensorrt_llm.logger import logger from tensorrt_llm.lora_helper import (LoraConfig, get_default_trtllm_modules_to_hf_modules) from tensorrt_llm.lora_manager import load_torch_lora -from tensorrt_llm.mapping import Mapping +from tensorrt_llm.mapping import CpType, Mapping from ..model_config import ModelConfig from ..speculative import get_num_extra_kv_tokens, get_spec_decoder @@ -25,11 +24,11 @@ from .guided_decoder import GuidedDecoder from .kv_cache_transceiver import AttentionTypeCpp, create_kv_cache_transceiver from .llm_request import ExecutorResponse +from .mamba_cache_manager import MambaHybridCacheManager from .model_engine import PyTorchModelEngine from .py_executor import PyExecutor -from .resource_manager import (KVCacheManager, MambaHybridCacheManager, - PeftCacheManager, ResourceManager, - ResourceManagerType) +from .resource_manager import (KVCacheManager, PeftCacheManager, + ResourceManager, ResourceManagerType) from .sampler import EarlyStopSampler, TorchSampler, TRTLLMSampler from .scheduler import (BindCapacityScheduler, BindMicroBatchScheduler, SimpleScheduler) @@ -589,20 +588,24 @@ def instantiate_sampler(engine: PyTorchModelEngine, mapping, max_seq_len=engine.max_seq_len, enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler) - if mapping.cp_config.get('cp_type') == 'star_attention': + decoding_mode = get_decoding_mode(executor_config) + if mapping.cp_config.get('cp_type') == CpType.STAR: assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'" return TorchSampler(sampler_args) if engine.spec_config is not None and engine.spec_config.spec_dec_mode.has_spec_decoder( ): return get_spec_decoder(sampler_args, engine.spec_config) - if pytorch_backend_config.use_torch_sampler or pytorch_backend_config.enable_mixed_sampler or engine.spec_config is not None: - return TorchSampler(sampler_args) + if pytorch_backend_config.sampler_type == SamplerType.TRTLLMSampler or ( + pytorch_backend_config.sampler_type == SamplerType.auto + and decoding_mode.isBeamSearch()): + logger.debug(f"DecodingMode: {decoding_mode.name}") + return TRTLLMSampler(executor_config, engine.model, engine.dtype, + mapping, decoding_mode, + pytorch_backend_config.disable_overlap_scheduler) if not engine.model.model_config.is_generation: # NOTE: choose sampler based on model type return EarlyStopSampler() - return TRTLLMSampler(executor_config, engine.model, engine.dtype, mapping, - get_decoding_mode(executor_config), - pytorch_backend_config.disable_overlap_scheduler) + return TorchSampler(sampler_args) def get_decoding_mode(executor_config: ExecutorConfig) -> DecodingMode: @@ -623,90 +626,6 @@ def get_decoding_mode(executor_config: ExecutorConfig) -> DecodingMode: ) decoding_mode = DecodingMode.TopKTopP() - # Override decoding mode when Medusa is used - if getattr(executor_config.speculative_config, "is_medusa", - False) and not decoding_mode.isMedusa(): - logger.warning( - "Model is Medusa, but decoding mode is not Medusa. Overwriting decoding mode to Medusa." - ) - decoding_mode = DecodingMode.Medusa() - - # Override decoding mode when Medusa is not used - if (not getattr(executor_config.speculative_config, "is_medusa", - False)) and decoding_mode.isMedusa(): - logger.warning( - "Model is not Medusa, but decoding mode is Medusa. Overwriting decoding mode." - ) - if executor_config.max_beam_width == 1: - decoding_mode = DecodingMode.TopKTopP() - else: - decoding_mode = DecodingMode.BeamSearch() - - # Override decoding mode when lookahead decoding is used - if getattr(executor_config.speculative_config, "is_lookahead", - False) and not decoding_mode.isLookahead(): - logger.warning( - "Model is Lookahead, but decoding mode is not Lookahead. Overwriting decoding mode to Lookahead." - ) - decoding_mode = DecodingMode.Lookahead() - - # Override decoding mode when lookahead decoding is not used - if (not getattr(executor_config.speculative_config, "is_lookahead", - False)) and decoding_mode.isLookahead(): - logger.warning( - "Model is not built with Lookahead decoding, but decoding mode is Lookahead. Overwriting decoding mode." - ) - if executor_config.max_beam_width == 1: - decoding_mode = DecodingMode.TopKTopP() - else: - decoding_mode = DecodingMode.BeamSearch() - - # Override decoding mode when 'explicit draft tokens' is used - if getattr(executor_config.speculative_config, "is_explicit_draft_tokens", - False) and not decoding_mode.isExplicitDraftTokens(): - logger.warning( - "Model is built with 'explicit draft tokens' decoding, but decoding mode is something else. Overwriting decoding mode." - ) - decoding_mode = DecodingMode.ExplicitDraftTokens() - - # Override decoding mode when 'explicit draft tokens' is not used - if (not getattr(executor_config.speculative_config, - "is_explicit_draft_tokens", - False)) and decoding_mode.isExplicitDraftTokens(): - logger.warning( - "Model is not built with 'explicit draft tokens' decoding, but decoding mode is set to it. Overwriting decoding mode to default." - ) - if executor_config.max_beam_width == 1: - decoding_mode = DecodingMode.TopKTopP() - else: - decoding_mode = DecodingMode.BeamSearch() - - # Override decoding mode when EAGLE is used - if getattr(executor_config.speculative_config, "is_eagle", - False) and not decoding_mode.isEagle(): - logger.warning( - "Model is Eagle, but decoding mode is not Eagle. Overwriting decoding mode to Eagle." - ) - decoding_mode = DecodingMode.Eagle() - - # Override decoding mode when Eagle is not used - if (not getattr(executor_config.speculative_config, "is_eagle", - False)) and decoding_mode.isEagle(): - logger.warning( - "Model is not Eagle, but decoding mode is Eagle. Overwriting decoding mode." - ) - if executor_config.max_beam_width == 1: - decoding_mode = DecodingMode.TopKTopP() - else: - decoding_mode = DecodingMode.BeamSearch() - - # Override decoding mode when draft tokens are external - if getattr(executor_config.speculative_config, "is_draft_tokens_external", - False): - logger.warning("Overwriting decoding mode to external draft token") - decoding_mode = DecodingMode.ExternalDraftTokens() - - logger.debug(f"DecodingMode: {decoding_mode.name}") return decoding_mode diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py index 226bd0880ee..631f974db26 100644 --- a/tensorrt_llm/_torch/pyexecutor/config.py +++ b/tensorrt_llm/_torch/pyexecutor/config.py @@ -6,7 +6,7 @@ from tensorrt_llm.bindings.executor import ExecutorConfig from ...builder import BuildConfig -from ...llmapi.llm_args import LoadFormat +from ...llmapi.llm_args import LoadFormat, SamplerType from ...logger import logger from ...mapping import Mapping from ..model_config import MoeLoadBalancerConfig @@ -60,9 +60,10 @@ class PyTorchConfig: If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc. """ - use_torch_sampler: bool = False + sampler_type: SamplerType = SamplerType.auto """ - If true, will use the Torch sampler instead of the TRTLLM sampler. + The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto. + Defaults to auto, which will use TorchSampler unless BeamSearch is requested. """ kv_cache_dtype: str = "auto" diff --git a/tensorrt_llm/_torch/pyexecutor/config_utils.py b/tensorrt_llm/_torch/pyexecutor/config_utils.py index c0f0482674e..914ec6fcd9b 100644 --- a/tensorrt_llm/_torch/pyexecutor/config_utils.py +++ b/tensorrt_llm/_torch/pyexecutor/config_utils.py @@ -5,9 +5,7 @@ def is_nemotron_hybrid(config): def is_mla(config): - if hasattr(config, "kv_lora_rank"): - assert hasattr( - config, "qk_rope_head_dim" - ), "both of kv_lora_rank and qk_rope_head_dim are required." + if getattr(config, "kv_lora_rank", None) and getattr( + config, "qk_rope_head_dim", None): return True return False diff --git a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py index 6f072f90bef..17ba4983b76 100644 --- a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py +++ b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py @@ -11,6 +11,7 @@ import torch from tensorrt_llm._utils import nvtx_range +from tensorrt_llm.mapping import CpType from ..distributed import Distributed from .llm_request import (ExecutorRequest, LlmRequest, @@ -569,9 +570,9 @@ def _merge_requests( cp_config = self.dist.cp_config if 'cp_type' in cp_config: cp_type = cp_config['cp_type'] - if cp_type == 'star_attention': + if cp_type == CpType.STAR: return self._merge_star_attention_requests(new_requests) - elif cp_type == 'ring_attention': + elif cp_type == CpType.RING: raise NotImplementedError("ring attention not implemented yet") else: raise NotImplementedError(f'unsupport cp type {cp_type}') diff --git a/tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py b/tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py new file mode 100644 index 00000000000..707fdf33fbe --- /dev/null +++ b/tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py @@ -0,0 +1,246 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Union + +import torch + +from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest +from tensorrt_llm._torch.pyexecutor.resource_manager import ( + BaseResourceManager, CacheTypeCpp, DataType, KvCacheConfigCpp, + KVCacheManager, get_pp_layers) +from tensorrt_llm._torch.pyexecutor.scheduler import ScheduledRequests +from tensorrt_llm.mapping import Mapping + + +class MambaCacheManager(BaseResourceManager): + + def __init__( + self, + d_state: int, + d_conv: int, + num_heads: int, + n_groups: int, + head_dim: int, + num_layers: int, + max_batch_size: int, + mapping: Mapping, + dtype: torch.dtype, + ssm_cache_dtype: torch.dtype, + layer_mask: Optional[List[bool]] = None, + ) -> None: + + self.mamba_ssm_cache_dtype = ssm_cache_dtype + + # get tp size + tp_size = mapping.tp_size + + # derive mamba parameters for conv and ssm states + d_inner = head_dim * num_heads + conv_dim = d_inner + 2 * n_groups * d_state + nheads = num_heads + + # check that can be partitioned + assert nheads % tp_size == 0, "nheads must be divisible by tp_size" + assert conv_dim % tp_size == 0, "conv_dim must be divisible by tp_size" + + # partition conv_dim and nheads + conv_dim = conv_dim // tp_size + nheads = nheads // tp_size + + # conv and ssm states device + device = torch.device("cuda") + + pp_layers, num_layers = get_pp_layers( + num_layers, + mapping, + layer_mask=layer_mask, + ) + num_local_layers = len(pp_layers) + self.mamba_layer_offsets = { + idx: offset + for offset, idx in enumerate(pp_layers) + } + + # mamba conv states + self.conv_states = torch.empty( + size=[ + num_local_layers, + max_batch_size, + conv_dim, + d_conv - 1, + ], + dtype=dtype, + device=device, + ) + + # mamba ssm states + self.ssm_states = torch.empty( + size=[ + num_local_layers, + max_batch_size, + nheads, + head_dim, + d_state, + ], + dtype=self.mamba_ssm_cache_dtype, + device=device, + ) + + # mamba cache available blocks + self.mamba_cache_free_blocks = [i for i in range(max_batch_size)] + + # mamba cache index, maps request_id -> state indices + self.mamba_cache_index: Dict[int, int] = {} + + # mamba cache state indices + self.state_indices: torch.Tensor = torch.arange(max_batch_size, + device=device, + dtype=torch.int32) + + def _prepare_mamba_cache_blocks(self, request_ids: List[int]): + state_indices = [] + for r in request_ids: + # cache hit + if r in self.mamba_cache_index: + state_indices.append(self.mamba_cache_index[r]) + # cache miss + else: + if len(self.mamba_cache_free_blocks) == 0: + raise Exception("run out of mamba cache blocks") + block = self.mamba_cache_free_blocks.pop() + self.mamba_cache_index[r] = block + state_indices.append(block) + self.state_indices[:len(state_indices)] = torch.as_tensor( + state_indices, dtype=torch.int32, device=self.ssm_states.device) + + def prepare_resources(self, scheduled_batch: ScheduledRequests): + context_ids = [ + i.py_request_id for i in scheduled_batch.context_requests + ] + generation_ids = [ + i.py_request_id for i in scheduled_batch.generation_requests + ] + request_ids = context_ids + generation_ids + self._prepare_mamba_cache_blocks(request_ids) + + def free_resources(self, request: LlmRequest): + request_id = request.py_request_id + if request_id in self.mamba_cache_index: + block = self.mamba_cache_index.pop(request_id) + self.mamba_cache_free_blocks.append(block) + + def get_state_indices(self) -> torch.Tensor: + return self.state_indices + + def get_conv_states(self, layer_idx: int) -> torch.Tensor: + layer_offset = self.mamba_layer_offsets[layer_idx] + return self.conv_states[layer_offset] + + def get_ssm_states(self, layer_idx: int) -> torch.Tensor: + layer_offset = self.mamba_layer_offsets[layer_idx] + return self.ssm_states[layer_offset] + + def get_mamba_ssm_cache_dtype(self) -> torch.dtype: + return self.mamba_ssm_cache_dtype + + def shutdown(self): + # release tensor memory, keeping python references as tensors + self.conv_states = torch.tensor([]) + self.ssm_states = torch.tensor([]) + self.state_indices = torch.tensor([]) + torch.cuda.empty_cache() + + +class MambaHybridCacheManager(KVCacheManager, MambaCacheManager): + + def __init__( + self, + # mamba cache parameters + mamba_d_state: int, + mamba_d_conv: int, + mamba_num_heads: int, + mamba_n_groups: int, + mamba_head_dim: int, + mamba_num_layers: int, + mamba_layer_mask: List[bool], + mamba_cache_dtype: torch.dtype, + mamba_ssm_cache_dtype: torch.dtype, + + # kv cache parameters + kv_cache_config: KvCacheConfigCpp, + kv_cache_type: CacheTypeCpp, + *, + num_layers: int, + layer_mask: List[bool], + num_kv_heads: Union[int, List[Optional[int]]], + head_dim: int, + tokens_per_block: int, + # Note that max_seq_len is not necessarily equal to kv_cache_config.num_tokens. + # It's derived from the model's BuildConfig for consistency with the C++ backend. + max_seq_len: int, + max_batch_size: int, + mapping: Mapping, + dtype: DataType = DataType.HALF, + spec_config: Optional["DecodingBaseConfig"] = None, + ) -> None: + + # mamba hybrid cache requires block reuse to be disabled in KV cache config + assert not kv_cache_config.enable_block_reuse, "mamba hybrid cache requires block reuse to be disabled in KV cache config" + + # initialize mamba cache manager + MambaCacheManager.__init__( + self, + mamba_d_state, + mamba_d_conv, + mamba_num_heads, + mamba_n_groups, + mamba_head_dim, + mamba_num_layers, + max_batch_size, + mapping, + mamba_cache_dtype, + mamba_ssm_cache_dtype, + mamba_layer_mask, + ) + + # initialize kv cache manager + KVCacheManager.__init__( + self, + kv_cache_config, + kv_cache_type, + num_layers=num_layers, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + tokens_per_block=tokens_per_block, + max_seq_len=max_seq_len, + max_batch_size=max_batch_size, + mapping=mapping, + dtype=dtype, + spec_config=spec_config, + layer_mask=layer_mask, + ) + + def prepare_resources(self, scheduled_batch: ScheduledRequests): + MambaCacheManager.prepare_resources(self, scheduled_batch) + KVCacheManager.prepare_resources(self, scheduled_batch) + + def free_resources(self, request: LlmRequest): + MambaCacheManager.free_resources(self, request) + KVCacheManager.free_resources(self, request) + + def shutdown(self): + MambaCacheManager.shutdown(self) + KVCacheManager.shutdown(self) diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index b4380f16d4f..22a53c4666f 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -29,7 +29,7 @@ from tensorrt_llm.logger import logger from tensorrt_llm.lora_helper import LoraConfig from tensorrt_llm.lora_manager import LoraModelConfig -from tensorrt_llm.mapping import Mapping +from tensorrt_llm.mapping import CpType, Mapping from tensorrt_llm.models.modeling_utils import QuantAlgo from tensorrt_llm.quantization.utils.fp4_utils import float4_e2m1x2 @@ -666,7 +666,7 @@ def release_batch(result: ScheduledRequests | None): # TODO: current warmup_request is not suitable for star attention cp_type = self.mapping.cp_config.get('cp_type', None) - if cp_type == 'star_attention': + if cp_type == CpType.STAR: return with contextlib.ExitStack() as stack: @@ -2110,7 +2110,7 @@ def _prepare_inputs( cache_indirection_buffer: Optional[torch.Tensor] = None): if self.mapping is not None and 'cp_type' in self.mapping.cp_config: cp_type = self.mapping.cp_config['cp_type'] - if 'star_attention' == cp_type: + if CpType.STAR == cp_type: return self._prepare_star_attention_inputs( scheduled_requests, kv_cache_manager, attn_metadata) else: diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index b93ec028a1b..d4886887cef 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -31,6 +31,7 @@ from tensorrt_llm.bindings.internal.batch_manager import (LlmRequestType, ReqIdsSet) from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import CpType from tensorrt_llm.runtime.generation import CUASSERT from ..distributed import Distributed @@ -1460,7 +1461,7 @@ def _update_request_states(self, scheduled_requests: ScheduledRequests): cp_config = self.dist.cp_config if 'cp_type' in cp_config: cp_type = cp_config['cp_type'] - if cp_type == 'star_attention': + if cp_type == CpType.STAR: self._update_request_states_star_attention(scheduled_requests) else: assert False, f'Unsupport cp_type {cp_type}' diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 7729e3510ea..9a5b42166dc 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -16,7 +16,7 @@ from ..._utils import binding_dtype_size, binding_to_str_dtype, nvtx_range from ...logger import logger -from ...mapping import Mapping +from ...mapping import CpType, Mapping from .llm_request import (LlmRequest, LlmRequestState, SamplingConfig, get_draft_token_length) from .scheduler import ScheduledRequests @@ -402,7 +402,7 @@ def prepare_resources(self, scheduled_batch: ScheduledRequests): # allocate KV Cache for req in context_batch: req_beam_width = req.sampling_config.beam_width - if 'cp_type' in self.mapping.cp_config and 'star_attention' == self.mapping.cp_config[ + if 'cp_type' in self.mapping.cp_config and CpType.STAR == self.mapping.cp_config[ 'cp_type']: if req.ctx_iters == 0: seq_len = sum( @@ -931,227 +931,6 @@ def _set_temp_attention_window_inputs( return None -class MambaCacheManager(BaseResourceManager): - - def __init__( - self, - d_state: int, - d_conv: int, - num_heads: int, - n_groups: int, - head_dim: int, - num_layers: int, - max_batch_size: int, - mapping: Mapping, - dtype: torch.dtype, - ssm_cache_dtype: torch.dtype, - layer_mask: Optional[List[bool]] = None, - ) -> None: - - self.mamba_ssm_cache_dtype = ssm_cache_dtype - - # get tp size - tp_size = mapping.tp_size - - # derive mamba parameters for conv and ssm states - d_inner = head_dim * num_heads - conv_dim = d_inner + 2 * n_groups * d_state - nheads = num_heads - - # check that can be partitioned - assert nheads % tp_size == 0, "nheads must be divisible by tp_size" - assert conv_dim % tp_size == 0, "conv_dim must be divisible by tp_size" - - # partition conv_dim and nheads - conv_dim = conv_dim // tp_size - nheads = nheads // tp_size - - # conv and ssm states device - device = torch.device("cuda") - - pp_layers, num_layers = get_pp_layers( - num_layers, - mapping, - layer_mask=layer_mask, - ) - num_local_layers = len(pp_layers) - self.mamba_layer_offsets = { - idx: offset - for offset, idx in enumerate(pp_layers) - } - - # mamba conv states - self.conv_states = torch.empty( - size=[ - num_local_layers, - max_batch_size, - conv_dim, - d_conv - 1, - ], - dtype=dtype, - device=device, - ) - - # mamba ssm states - self.ssm_states = torch.empty( - size=[ - num_local_layers, - max_batch_size, - nheads, - head_dim, - d_state, - ], - dtype=self.mamba_ssm_cache_dtype, - device=device, - ) - - # mamba cache available blocks - self.mamba_cache_free_blocks = [i for i in range(max_batch_size)] - - # mamba cache index, maps request_id -> state indices - self.mamba_cache_index: Dict[int, int] = {} - - # mamba cache state indices - self.state_indices: torch.Tensor = torch.arange(max_batch_size, - device=device, - dtype=torch.int32) - - def _prepare_mamba_cache_blocks(self, request_ids: List[int]): - state_indices = [] - for r in request_ids: - # cache hit - if r in self.mamba_cache_index: - state_indices.append(self.mamba_cache_index[r]) - # cache miss - else: - if len(self.mamba_cache_free_blocks) == 0: - raise Exception("run out of mamba cache blocks") - block = self.mamba_cache_free_blocks.pop() - self.mamba_cache_index[r] = block - state_indices.append(block) - self.state_indices[:len(state_indices)] = torch.as_tensor( - state_indices, dtype=torch.int32, device=self.ssm_states.device) - - def prepare_resources(self, scheduled_batch: ScheduledRequests): - context_ids = [ - i.py_request_id for i in scheduled_batch.context_requests - ] - generation_ids = [ - i.py_request_id for i in scheduled_batch.generation_requests - ] - request_ids = context_ids + generation_ids - self._prepare_mamba_cache_blocks(request_ids) - - def free_resources(self, request: LlmRequest): - request_id = request.py_request_id - if request_id in self.mamba_cache_index: - block = self.mamba_cache_index.pop(request_id) - self.mamba_cache_free_blocks.append(block) - - def get_state_indices(self) -> torch.Tensor: - return self.state_indices - - def get_conv_states(self, layer_idx: int) -> torch.Tensor: - layer_offset = self.mamba_layer_offsets[layer_idx] - return self.conv_states[layer_offset] - - def get_ssm_states(self, layer_idx: int) -> torch.Tensor: - layer_offset = self.mamba_layer_offsets[layer_idx] - return self.ssm_states[layer_offset] - - def get_mamba_ssm_cache_dtype(self) -> torch.dtype: - return self.mamba_ssm_cache_dtype - - def shutdown(self): - # release tensor memory, keeping python references as tensors - self.conv_states = torch.tensor([]) - self.ssm_states = torch.tensor([]) - self.state_indices = torch.tensor([]) - torch.cuda.empty_cache() - - -class MambaHybridCacheManager(KVCacheManager, MambaCacheManager): - - def __init__( - self, - # mamba cache parameters - mamba_d_state: int, - mamba_d_conv: int, - mamba_num_heads: int, - mamba_n_groups: int, - mamba_head_dim: int, - mamba_num_layers: int, - mamba_layer_mask: List[bool], - mamba_cache_dtype: torch.dtype, - mamba_ssm_cache_dtype: torch.dtype, - - # kv cache parameters - kv_cache_config: KvCacheConfigCpp, - kv_cache_type: CacheTypeCpp, - *, - num_layers: int, - layer_mask: List[bool], - num_kv_heads: Union[int, List[Optional[int]]], - head_dim: int, - tokens_per_block: int, - # Note that max_seq_len is not necessarily equal to kv_cache_config.num_tokens. - # It's derived from the model's BuildConfig for consistency with the C++ backend. - max_seq_len: int, - max_batch_size: int, - mapping: Mapping, - dtype: DataType = DataType.HALF, - spec_config: Optional["DecodingBaseConfig"] = None, - ) -> None: - - # mamba hybrid cache requires block reuse to be disabled in KV cache config - assert not kv_cache_config.enable_block_reuse, "mamba hybrid cache requires block reuse to be disabled in KV cache config" - - # initialize mamba cache manager - MambaCacheManager.__init__( - self, - mamba_d_state, - mamba_d_conv, - mamba_num_heads, - mamba_n_groups, - mamba_head_dim, - mamba_num_layers, - max_batch_size, - mapping, - mamba_cache_dtype, - mamba_ssm_cache_dtype, - mamba_layer_mask, - ) - - # initialize kv cache manager - KVCacheManager.__init__( - self, - kv_cache_config, - kv_cache_type, - num_layers=num_layers, - num_kv_heads=num_kv_heads, - head_dim=head_dim, - tokens_per_block=tokens_per_block, - max_seq_len=max_seq_len, - max_batch_size=max_batch_size, - mapping=mapping, - dtype=dtype, - spec_config=spec_config, - layer_mask=layer_mask, - ) - - def prepare_resources(self, scheduled_batch: ScheduledRequests): - MambaCacheManager.prepare_resources(self, scheduled_batch) - KVCacheManager.prepare_resources(self, scheduled_batch) - - def free_resources(self, request: LlmRequest): - MambaCacheManager.free_resources(self, request) - KVCacheManager.free_resources(self, request) - - def shutdown(self): - MambaCacheManager.shutdown(self) - KVCacheManager.shutdown(self) - - class SlotManager: def __init__(self, max_num_requests: int): diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py index 7abb97f3b42..f7cdd92a561 100644 --- a/tensorrt_llm/_torch/speculative/interface.py +++ b/tensorrt_llm/_torch/speculative/interface.py @@ -23,6 +23,9 @@ class SpeculativeDecodingMode(IntEnum): def is_mtp(self): return self == SpeculativeDecodingMode.MTP or self == SpeculativeDecodingMode.MTP_EAGLE + def is_mtp_vanilla(self): + return self == SpeculativeDecodingMode.MTP + def is_mtp_eagle(self): return self == SpeculativeDecodingMode.MTP_EAGLE @@ -88,11 +91,13 @@ def extend_ctx(self, attention_backend: Type[AttentionBackend]): any spec dec mode that uses the SpecExecutor. """ - # Fixme: only trtllm attention backend supports eagle3 generation-phase kernels on blackwell. - return ((self.is_eagle3() or self.is_draft_target()) - and not (issubclass(attention_backend, TrtllmAttention) - and get_sm_version() == 100) - ) or self.is_ngram() or self.is_user_provided() + if self.use_one_engine(): + # 1-model has separate logic for handling draft tokens + return False + + # The special XQA generation kernels only exist with the TRTLLM backend on blackwell. + return not issubclass(attention_backend, + TrtllmAttention) or get_sm_version() != 100 def attention_need_spec_dec_mode(self): """ diff --git a/tensorrt_llm/_torch/speculative/mtp.py b/tensorrt_llm/_torch/speculative/mtp.py index 1772125bcbf..2658ce539b5 100644 --- a/tensorrt_llm/_torch/speculative/mtp.py +++ b/tensorrt_llm/_torch/speculative/mtp.py @@ -330,11 +330,9 @@ def forward( position_ids, hidden_states, logits, - lm_head, - embed_tokens, attn_metadata, spec_metadata, - mtp_layers, + draft_model, ): ''' Example: @@ -470,9 +468,10 @@ def forward( next_draft_tokens = [] last_tokens_idx = torch.cumsum( attn_metadata.seq_lens_cuda, dim=0, dtype=torch.long) - 1 - for _, mtp_layer in enumerate(mtp_layers): - hidden_states = mtp_layer(embed_tokens=embed_tokens, **draft_inputs) - logits = mtp_layer.shared_head(hidden_states, lm_head, + for _, mtp_layer in enumerate(draft_model.mtp_layers): + hidden_states = mtp_layer(embed_tokens=draft_model.embed_tokens, + **draft_inputs) + logits = mtp_layer.shared_head(hidden_states, draft_model.lm_head, attn_metadata).float() new_draft_token = self.draft_sampler(logits) next_draft_tokens.append(new_draft_token) @@ -517,11 +516,9 @@ def skip_forward( position_ids, hidden_states, logits, - lm_head, - embed_tokens, attn_metadata, spec_metadata, - mtp_layers, + draft_model, ): batch_size = attn_metadata.num_seqs mtp_num_modules = self.spec_config.num_nextn_predict_layers @@ -1127,11 +1124,9 @@ def forward( position_ids, hidden_states, logits, - lm_head, - embed_tokens, attn_metadata, spec_metadata, - mtp_layers, + draft_model, ): batch_size = attn_metadata.num_seqs num_contexts = attn_metadata.num_contexts @@ -1172,8 +1167,8 @@ def prepare_position_ids_and_last_tokens(position_ids, attn_metadata): next_draft_tokens = [] for i in range(self.mtp_num_modules): if i == 0: - hidden_states = mtp_layers[0]( - embed_tokens=embed_tokens, + hidden_states = draft_model.mtp_layers[0]( + embed_tokens=draft_model.embed_tokens, all_rank_num_tokens=spec_metadata.all_rank_num_tokens, all_rank_max_num_tokens=spec_metadata. all_rank_max_num_tokens, @@ -1186,8 +1181,8 @@ def prepare_position_ids_and_last_tokens(position_ids, attn_metadata): gather_ids = torch.concat( [last_tokens_idx[:num_contexts], gather_ids_gen], dim=0) else: - hidden_states = mtp_layers[0]( - embed_tokens=embed_tokens, + hidden_states = draft_model.mtp_layers[0]( + embed_tokens=draft_model.embed_tokens, all_rank_num_tokens=spec_metadata. subseq_all_rank_num_tokens, all_rank_max_num_tokens=max( @@ -1197,8 +1192,9 @@ def prepare_position_ids_and_last_tokens(position_ids, attn_metadata): **inputs) # All of the seq_len are 1, use batch_indices_cuda as gather_ids gather_ids = spec_metadata.batch_indices_cuda[:batch_size] - logits = mtp_layers[0].shared_head(hidden_states[gather_ids], - lm_head, attn_metadata, True) + logits = draft_model.mtp_layers[0].shared_head( + hidden_states[gather_ids], draft_model.lm_head, attn_metadata, + True) new_draft_token = self.draft_sampler(logits) hidden_states, position_ids = self.update_draft_tokens( diff --git a/tensorrt_llm/_torch/speculative/utils.py b/tensorrt_llm/_torch/speculative/utils.py index ad7fbf8fd56..c4a4ccf7e3c 100644 --- a/tensorrt_llm/_torch/speculative/utils.py +++ b/tensorrt_llm/_torch/speculative/utils.py @@ -154,11 +154,12 @@ def get_num_spec_layers(spec_config): def get_spec_worker(spec_config, model_config, mapping): - if spec_config.spec_dec_mode.is_mtp(): + spec_dec_mode = spec_config.spec_dec_mode + if spec_dec_mode.is_mtp_vanilla(): return MTPWorker(spec_config, model_config) - if spec_config.spec_dec_mode.is_mtp_eagle(): + if spec_dec_mode.is_mtp_eagle(): return MTPEagleWorker(spec_config, model_config) - if spec_config.spec_dec_mode.is_eagle3_one_model(): + if spec_dec_mode.is_eagle3_one_model(): return Eagle3OneModelWorker(spec_config, mapping) return None diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py index 2184919465c..18d7980ea4d 100755 --- a/tensorrt_llm/bench/benchmark/throughput.py +++ b/tensorrt_llm/bench/benchmark/throughput.py @@ -515,10 +515,10 @@ def ignore_trt_only_args(kwargs: dict): report_utility.report_statistics() except KeyboardInterrupt: logger.info("Keyboard interrupt, exiting benchmark...") - sys.exit(130) - except Exception as e: - logger.error(f"Error during benchmarking: {e}") - sys.exit(-1) + except Exception: + import traceback + logger.error(f"Error during benchmarking:\n{traceback.format_exc()}") + sys.exit(1) finally: if llm is not None: llm.shutdown() diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index 75f497b29bb..acf7f60bcbb 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -318,7 +318,7 @@ def get_statistics_dict(self) -> Dict[str, Any]: "backend": "Pytorch", "dtype": - torch_dtype_to_str(model_config.pretrained_config.torch_dtype + torch_dtype_to_str(model_config.torch_dtype or model_config.pretrained_config. get_text_config().torch_dtype), "kv_cache_dtype": diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 0cf7302fbee..2492eb6a61b 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -4734,6 +4734,15 @@ def create_sinusoidal_positions_for_attention_plugin( inv_freq = 1.0 / (theta**(np.arange(0, dim, 2) / dim)).astype(dtype) inv_freq = RopeEmbeddingUtils.apply_llama3_scaling( inv_freq, rope_scaling_config) + elif scale_type == RotaryScalingType.dynamic: + # Make sure scaling_alpha exists in rope_scaling + # Ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct-FP8/blob/main/modeling_hunyuan.py#L346 + assert rope_scaling_config[ + "alpha"] is not None, "rope_scaling_config.alpha must be provided." + scaling_alpha = rope_scaling_config["alpha"] + adjusted_base = theta * (scaling_alpha**(dim / (dim - 2))) + inv_freq = 1.0 / (adjusted_base**( + np.arange(0, dim, 2, dtype=dtype) / dim)).astype(dtype) else: inv_freq = scale / (theta **(np.arange(0, dim, 2) / dim)).astype(dtype) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 0f377657261..abc41b00356 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1968,6 +1968,13 @@ class LoadFormat(Enum): DUMMY = 1 +class SamplerType(StrEnum): + """Enum for sampler type options.""" + TRTLLMSampler = "TRTLLMSampler" + TorchSampler = "TorchSampler" + auto = "auto" + + class TorchCompileConfig(StrictBaseModel): """ Configuration for torch.compile. @@ -2055,11 +2062,11 @@ class TorchLlmArgs(BaseLlmArgs): "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc.", status="beta") - use_torch_sampler: bool = Field( - default=False, + sampler_type: Union[str, SamplerType] = Field( + default=SamplerType.auto, description= - "If true, will use the Torch sampler instead of the TRTLLM sampler.", - status="beta") + "The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto. Defaults to auto, which will use TorchSampler unless BeamSearch is requested.", + status="prototype") enable_iter_perf_stats: bool = Field( default=False, @@ -2344,7 +2351,7 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig": attn_backend=self.attn_backend, moe_backend=self.moe_config.backend, enable_mixed_sampler=self.enable_mixed_sampler, - use_torch_sampler=self.use_torch_sampler, + sampler_type=self.sampler_type, kv_cache_dtype=self.kv_cache_config.dtype, mamba_ssm_cache_dtype=self.kv_cache_config.mamba_ssm_cache_dtype, enable_iter_perf_stats=self.enable_iter_perf_stats, diff --git a/tensorrt_llm/llmapi/tokenizer.py b/tensorrt_llm/llmapi/tokenizer.py index c006169be7b..7e13643fb82 100644 --- a/tensorrt_llm/llmapi/tokenizer.py +++ b/tensorrt_llm/llmapi/tokenizer.py @@ -57,6 +57,11 @@ def decode(self, token_ids: List[int], *args, **kwargs) -> str: def batch_encode_plus(self, texts: List[str], *args, **kwargs) -> dict: return self.tokenizer.batch_encode_plus(texts, *args, **kwargs) + def get_chat_template(self, + chat_template: Optional[str] = None, + tools: Optional[List[Dict]] = None) -> str: + return self.tokenizer.get_chat_template(chat_template, tools) + def apply_chat_template( self, conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]]], *args, @@ -353,5 +358,8 @@ def load_hf_tokenizer(model_dir: str, use_fast=use_fast, **kwargs) - except Exception: + except Exception as e: + logger.warning( + f"Failed to load hf tokenizer from {model_dir}, encounter error: {e}" + ) return None diff --git a/tensorrt_llm/llmapi/utils.py b/tensorrt_llm/llmapi/utils.py index 8b2e516dba2..65000841909 100644 --- a/tensorrt_llm/llmapi/utils.py +++ b/tensorrt_llm/llmapi/utils.py @@ -3,6 +3,7 @@ import hashlib import io import os +import re import sys import tempfile import threading @@ -508,8 +509,10 @@ def generate_api_docs_as_docstring(model: Type[BaseModel], type_str = str(type_hints[field_name]) type_str = type_str.replace("typing.", "") # Extract just the class name from full class path - if "", r""]: + if (match := re.match(regex, type_str)) is not None: + type_str = match.group(1) + break else: type_str = field_type or 'Any' diff --git a/tensorrt_llm/mapping.py b/tensorrt_llm/mapping.py index 22824ea350d..cfc997b786a 100644 --- a/tensorrt_llm/mapping.py +++ b/tensorrt_llm/mapping.py @@ -12,11 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from enum import IntEnum from typing import List import torch +class CpType(IntEnum): + # CP type for ulysses parallelism + ULYSSES = 0 + # CP type for star attention + STAR = 1 + # CP type for ring attention + RING = 2 + # CP type for helix parallelism + HELIX = 3 + + class Mapping(object): ''' A node with 8 GPUs, tp_size = 4, cp_size = 1, pp_size = 2 @@ -135,58 +147,70 @@ def __init__( if moe_cluster_size == -1: moe_cluster_size = 1 + cp_type = CpType.ULYSSES if cp_config is None else cp_config.get( + "cp_type", CpType.ULYSSES) + moe_world_size = tp_size if cp_type == CpType.ULYSSES else tp_size * cp_size + if moe_tp_size == -1 and moe_ep_size == -1: - moe_tp_size = tp_size // moe_cluster_size + moe_tp_size = moe_world_size // moe_cluster_size moe_ep_size = 1 elif moe_tp_size == -1: - moe_tp_size = tp_size // (moe_ep_size * moe_cluster_size) + moe_tp_size = moe_world_size // (moe_ep_size * moe_cluster_size) elif moe_ep_size == -1: - moe_ep_size = tp_size // (moe_tp_size * moe_cluster_size) + moe_ep_size = moe_world_size // (moe_tp_size * moe_cluster_size) if attn_tp_size == -1 and attn_cp_size == -1: - # fallback to ulysses - attn_tp_size = tp_size * cp_size - attn_cp_size = 1 + if cp_type == CpType.ULYSSES: + # fallback to ulysses + attn_tp_size = tp_size * cp_size + attn_cp_size = 1 + else: + # fallback to helix + attn_tp_size = tp_size + attn_cp_size = cp_size elif attn_tp_size == -1: - attn_tp_size = cp_size * tp_size // attn_cp_size + attn_tp_size = (tp_size * cp_size) // attn_cp_size elif attn_cp_size == -1: - attn_cp_size = cp_size * tp_size // attn_tp_size + attn_cp_size = (tp_size * cp_size) // attn_tp_size - if attn_cp_size != 1: + if attn_cp_size != 1 and cp_type == CpType.ULYSSES: raise ValueError( - f"attn_cp_size must be 1 for now, but got {attn_tp_size}, {attn_cp_size}." + f"attn_cp_size must be 1 for now for ulysses, but got {attn_tp_size}, {attn_cp_size}." ) if auto_parallel: - if tp_size != 1 or pp_size != 1 or tp_size != 1: + if tp_size != 1 or pp_size != 1 or cp_size != 1: raise ValueError( - f"When auto parallel is enabled, tp_size, pp_size, cp_size must be 1, but got {tp_size}, {pp_size}, {cp_size}." - ) + "When auto parallel is enabled, tp_size, pp_size, cp_size must be 1, " + f"but got {tp_size}, {pp_size}, {cp_size}.") else: if tp_size * pp_size * cp_size != world_size: raise ValueError( - f"world_size must equal to tp_size * pp_size * cp_size, but got {world_size} != {tp_size} * {pp_size} * {cp_size}." + "world_size must equal to tp_size * pp_size * cp_size, " + f"but got {world_size} != {tp_size} * {pp_size} * {cp_size}." ) moe_tp_ep_size = moe_tp_size * moe_ep_size moe_tp_cluster_ep_size = moe_tp_ep_size * moe_cluster_size - if moe_tp_cluster_ep_size != tp_size: + if moe_tp_cluster_ep_size != moe_world_size: raise ValueError( - f"tp_size must equal to moe_tp_size * moe_ep_size * moe_cluster_size, but got {tp_size} != {moe_tp_size} * {moe_ep_size} * {moe_cluster_size}" - ) + "moe_tp_size * moe_ep_size * moe_cluster_size must equal to moe_world_size, " + f"but got {moe_tp_cluster_ep_size} != {moe_world_size}") attn_tp_cp_size = attn_tp_size * attn_cp_size if attn_tp_cp_size != tp_size * cp_size: raise ValueError( - f"tp_size * cp_size must equal to attn_tp_size * attn_cp_size, but got {tp_size} * {cp_size} != {attn_tp_size} * {attn_cp_size}" + "tp_size * cp_size must equal to attn_tp_size * attn_cp_size, " + f"but got {tp_size} * {cp_size} != {attn_tp_size} * {attn_cp_size}" ) - if moe_ep_size != 1 and cp_size > 1: - raise NotImplementedError("CP don't support MoE tp/ep yet") + if moe_ep_size != 1 and cp_size > 1 and cp_type != CpType.HELIX: + raise NotImplementedError( + f"CP {cp_type} doesn't support MoE tp/ep yet") self.tp_size = tp_size self.cp_size = cp_size @@ -275,6 +299,7 @@ def __eq__(self, other): and self.moe_ep_size == other.moe_ep_size and self.attn_tp_size == other.attn_tp_size and self.attn_cp_size == other.attn_cp_size + and self.cp_config == other.cp_config and self.auto_parallel == other.auto_parallel) def __hash__(self): @@ -290,6 +315,8 @@ def __hash__(self): self.moe_ep_size, self.attn_tp_size, self.attn_cp_size, + # note: we do not allow updating cp_config after initialization + tuple(sorted(self.cp_config.items())), self.auto_parallel, )) @@ -376,8 +403,13 @@ def local_rank(self): def dp_size(self): return self.tp_size if self.enable_attention_dp else 1 - def has_cp(self): - return self.cp_size > 1 + def has_cp_ulysses(self): + return self.cp_size > 1 and self.cp_config.get( + "cp_type") == CpType.ULYSSES + + def has_cp_helix(self): + return self.cp_size > 1 and self.cp_config.get( + "cp_type") == CpType.HELIX def get_node_rank(self, rank: int): return rank // self.gpus_per_node @@ -415,6 +447,29 @@ def next_pp_rank(self): p = p - self.world_size return p + def is_last_cp_rank(self): + return self.cp_rank == self.cp_size - 1 + + def is_first_cp_rank(self): + return self.cp_rank == 0 + + def has_cp(self): + return self.cp_size > 1 + + def prev_cp_rank(self): + p = self.rank - self.tp_size + if p // (self.tp_size * self.cp_size) < self.rank // (self.tp_size * + self.cp_size): + return p + self.tp_size * self.cp_size + return p + + def next_cp_rank(self): + p = self.rank + self.tp_size + if p // (self.tp_size * self.cp_size) > self.rank // (self.tp_size * + self.cp_size): + return p - self.tp_size * self.cp_size + return p + def has_moe_cluster(self): return self.moe_cluster_size > 1 @@ -453,5 +508,6 @@ def to_dict(self): 'moe_ep_size': self.moe_ep_size, 'attn_tp_size': self.attn_tp_size, 'attn_cp_size': self.attn_cp_size, + 'cp_config': self.cp_config, 'auto_parallel': self.auto_parallel, } diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 547dbddc104..051c5401a06 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -574,13 +574,13 @@ def test_nixl_backend(self): ctx_server_config = { "disable_overlap_scheduler": True, "cache_transceiver_config": { - "backend": "nixl" + "backend": "NIXL" } } gen_server_config = { "disable_overlap_scheduler": True, "cache_transceiver_config": { - "backend": "nixl" + "backend": "NIXL" } } disaggregated_server_config = { @@ -710,13 +710,13 @@ def test_nixl_backend(self): ctx_server_config = { "disable_overlap_scheduler": True, "cache_transceiver_config": { - "backend": "nixl" + "backend": "NIXL" } } gen_server_config = { "disable_overlap_scheduler": True, "cache_transceiver_config": { - "backend": "nixl" + "backend": "NIXL" } } disaggregated_server_config = { diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 9cb504d5b13..889733057b1 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -196,7 +196,7 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend, @skip_pre_hopper def test_fp8_llm_sampler(self): model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8" - with LLM(model_path, use_torch_sampler=True, max_batch_size=256) as llm: + with LLM(model_path, max_batch_size=256) as llm: assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 sampling_params = SamplingParams( @@ -229,7 +229,6 @@ def test_fp8_beam_search(self): max_beam_width=max_beam_width, max_batch_size=16, max_seq_len=1024, - use_torch_sampler=False, build_config=None) with llm: @@ -2011,8 +2010,7 @@ def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, - cuda_graph_config=CudaGraphConfig() if cuda_graph else None, - use_torch_sampler=True) + cuda_graph_config=CudaGraphConfig() if cuda_graph else None) with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8", tensor_parallel_size=tp_size, @@ -2034,8 +2032,7 @@ def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, - cuda_graph_config=CudaGraphConfig() if cuda_graph else None, - use_torch_sampler=True) + cuda_graph_config=CudaGraphConfig() if cuda_graph else None) with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B", tensor_parallel_size=tp_size, diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml index fad36aac4d8..4e3417c732a 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml @@ -12,7 +12,6 @@ context_servers: backend: "DEFAULT" urls: - "localhost:8001" - use_torch_sampler: True generation_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_torch_sampler.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml similarity index 93% rename from tests/integration/defs/disaggregated/test_configs/disagg_config_torch_sampler.yaml rename to tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml index f4b06f1d14e..287d1103a4f 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_torch_sampler.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml @@ -11,7 +11,7 @@ context_servers: max_seq_len: 4096 tensor_parallel_size: 1 pipeline_parallel_size: 1 - use_torch_sampler: True + sampler_type: "TRTLLMSampler" kv_cache_config: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False @@ -27,7 +27,7 @@ generation_servers: max_batch_size: 256 max_num_tokens: 4096 max_seq_len: 4096 - use_torch_sampler: True + sampler_type: "TRTLLMSampler" kv_cache_config: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index c193a358197..5f871163d93 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -55,8 +55,8 @@ def get_test_config(test_desc, example_dir, test_root): (2, f"{test_configs_root}/disagg_config_cuda_graph_padding.yaml"), "mixed": (2, f"{test_configs_root}/disagg_config_mixed.yaml"), "overlap": (2, f"{test_configs_root}/disagg_config_overlap.yaml"), - "torch_sampler": - (2, f"{test_configs_root}/disagg_config_torch_sampler.yaml"), + "trtllm_sampler": + (2, f"{test_configs_root}/disagg_config_trtllm_sampler.yaml"), "load_balance": (4, f"{test_configs_root}/disagg_config_load_balance.yaml"), "cache_aware_balance": @@ -211,7 +211,7 @@ def run_disaggregated_test(example_dir, poll_procs=[workers_proc, server_proc]) # Run the chat completion endpoint test only for TinyLlama - if test_desc == "overlap" or test_desc == "torch_sampler": + if test_desc == "overlap" or test_desc == "trtllm_sampler": chat_client_cmd = client_cmd + [ '-e', 'chat', '-o', 'output_chat.json' ] @@ -234,7 +234,7 @@ def run_disaggregated_test(example_dir, not_expected_strings = ["Berlin Berlin"] output_files = ['output.json', 'output_streaming.json'] - if test_desc == "overlap" or test_desc == "torch_sampler": + if test_desc == "overlap" or test_desc == "trtllm_sampler": # Disable streaming chat completion for overlap test # due to bug output_files.extend(['output_chat.json']) @@ -488,9 +488,9 @@ def test_disaggregated_overlap(disaggregated_test_root, llm_venv, @pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], indirect=True) -def test_disaggregated_torch_sampler(disaggregated_test_root, llm_venv, - disaggregated_example_root, - llama_model_root): +def test_disaggregated_trtllm_sampler(disaggregated_test_root, llm_venv, + disaggregated_example_root, + llama_model_root): src_dst_dict = { llama_model_root: f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0", @@ -501,7 +501,7 @@ def test_disaggregated_torch_sampler(disaggregated_test_root, llm_venv, os.symlink(src, dst, target_is_directory=True) run_disaggregated_test(disaggregated_example_root, - "torch_sampler", + "trtllm_sampler", env=llm_venv._new_env, cwd=llm_venv.get_working_directory()) @@ -566,7 +566,6 @@ def test_disaggregated_conditional(disaggregated_test_root, llm_venv, cwd=llm_venv.get_working_directory()) -@pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5441714") @pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'], indirect=True) def test_disaggregated_ngram(disaggregated_test_root, llm_venv, @@ -1246,8 +1245,8 @@ def get_config_for_benchmark(model_root, backend): def test_disaggregated_benchmark_on_diff_backends( disaggregated_test_root, disaggregated_example_root, llm_venv, benchmark_model_root, benchmark_root, shared_gpt_path): - nixl_config = get_config_for_benchmark(benchmark_model_root, "nixl") - ucx_config = get_config_for_benchmark(benchmark_model_root, "ucx") + nixl_config = get_config_for_benchmark(benchmark_model_root, "NIXL") + ucx_config = get_config_for_benchmark(benchmark_model_root, "UCX") temp_dir = tempfile.TemporaryDirectory() nixl_config_path = os.path.join(temp_dir.name, "nixl_config.yaml") ucx_config_path = os.path.join(temp_dir.name, "ucx_config.yaml") diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 7d0b66ae1d2..0123679d477 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -619,7 +619,7 @@ def test_trtllm_bench_invalid_token_pytorch(llm_root, llm_venv, model_name, f"throughput " \ f"--dataset {str(dataset_path)} --backend pytorch " \ f"--extra_llm_api_options {extra_options_path} " \ - f"> {output_path}" + f"> {output_path} 2>&1" # Check clean shutdown (no hang) with pytest.raises(subprocess.CalledProcessError) as exc_info: check_call(benchmark_cmd, shell=True, env=llm_venv._new_env) @@ -629,7 +629,7 @@ def test_trtllm_bench_invalid_token_pytorch(llm_root, llm_venv, model_name, stdout = f.read() # Check that error is reported correctly - assert "Error during benchmarking: Requests failed: Token ID out of range (1 requests)" in stdout + assert "Requests failed: Token ID out of range (1 requests)" in stdout def trtllm_bench_prolog( diff --git a/tests/integration/defs/triton_server/test_triton_llm.py b/tests/integration/defs/triton_server/test_triton_llm.py index 89605c2ebea..d6f4be2b05b 100644 --- a/tests/integration/defs/triton_server/test_triton_llm.py +++ b/tests/integration/defs/triton_server/test_triton_llm.py @@ -3726,6 +3726,12 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE, output = venv_check_output(llm_backend_venv, run_cmd) assert 'Request is cancelled' in output + # Test request cancellation for non-existing request and completed request + run_cmd = [ + f"{llm_backend_repo_root}/tools/tests/test_llmapi_cancel.py" + ] + output = venv_check_output(llm_backend_venv, run_cmd) + @pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"]) @pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"]) diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index b523aa1ecaa..ab43ce124f9 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -689,7 +689,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_att disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_torch_sampler[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-True-Qwen3-8B-FP8] disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8] diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index 4c92e077d87..aeaa1ba573b 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -122,7 +122,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_torch_sampler[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 5e9b3b4da07..d0c2bda2b7d 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -217,5 +217,6 @@ l0_dgx_h100: terms: stage: post_merge backend: triton + auto_trigger: others tests: - triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 81736d3b02a..d0c181ebe13 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -252,8 +252,6 @@ unittest/trt/attention/test_gpt_attention.py -k "partition1" SKIP (https://nvbug unittest/trt/attention/test_gpt_attention.py -k "partition2" SKIP (https://nvbugs/5412456) unittest/trt/attention/test_gpt_attention.py -k "partition3" SKIP (https://nvbugs/5412456) test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False] SKIP (https://nvbugs/5414909) -unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout] SKIP (https://nvbugs/5418673) -unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep4-enable_adp-enable_graph-tp8-trtllm-scout] SKIP (https://nvbugs/5418673) examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5141288) examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5419067) examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_vl_7b_instruct-nb:4] SKIP (https://nvbugs/5419068) @@ -287,8 +285,6 @@ examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP (h accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive SKIP (https://nvbugs/5444627) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] SKIP (https://nvbugs/5444687) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] SKIP (https://nvbugs/5444687) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5445466) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5445466) examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (https://nvbugs/5447530) examples/test_nemotron_nas.py::test_nemotron_nas_summary_2gpu[DeciLM-7B] SKIP (https://nvbugs/5444636) examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] SKIP (https://nvbugs/5453709) @@ -309,3 +305,18 @@ accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype SKIP (htt accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5445466) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5454898) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5454898) +examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5434372) +triton_server/test_triton.py::test_gpt_ib[gpt-ib] SKIP (https://nvbugs/5431116) +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5457489) +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5457489) +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5457489) +disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5457504) +accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 SKIP (https://nvbugs/5413197) +triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349) +triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624) +triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343) +triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482) +triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5456485) +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4] SKIP (https://nvbugs/5434320) +accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend SKIP (https://nvbugs/5448437) +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5448437) diff --git a/tests/scripts/perf-sanity/run_benchmark_serve.py b/tests/scripts/perf-sanity/run_benchmark_serve.py index d9c69314879..2d4928ae325 100644 --- a/tests/scripts/perf-sanity/run_benchmark_serve.py +++ b/tests/scripts/perf-sanity/run_benchmark_serve.py @@ -308,10 +308,6 @@ def generate_extra_llm_api_config(self, test_case: Dict[str, Any]) -> str: " enable_block_reuse: false", ] - # https://nvbugs/5437106: WAR to avoid illegal memory access in Scout - if "Scout" in test_case['model']: - config_lines.append("use_torch_sampler: true") - # Add moe_config if moe_backend is specified if test_case['moe_backend']: config_lines.append("moe_config:") diff --git a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py index 269d43596e7..f5c993a785e 100644 --- a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py +++ b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py @@ -46,6 +46,7 @@ def create_nemotron_h_llm(use_cuda_graph, enable_block_reuse=False, mamba_ssm_cache_dtype="auto" if mamba_ssm_cache_dtype is None else mamba_ssm_cache_dtype), + sampler_type="TRTLLMSampler", ) diff --git a/tests/unittest/_torch/multi_gpu/test_star_attention.py b/tests/unittest/_torch/multi_gpu/test_star_attention.py index 89f8521b12e..abad54e6bc3 100644 --- a/tests/unittest/_torch/multi_gpu/test_star_attention.py +++ b/tests/unittest/_torch/multi_gpu/test_star_attention.py @@ -8,6 +8,7 @@ from tensorrt_llm import LLM, SamplingParams from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory +from tensorrt_llm.mapping import CpType from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig MAX_SEQ_LEN = 4096 + 1024 @@ -54,7 +55,7 @@ def test_model(backend, model_name, quant, sp_size, sa_block_size, model_dir = str(llm_models_root() / model_name) cp_config = { - "cp_type": "star_attention", + "cp_type": CpType.STAR, "cp_anchor_size": sa_anchor_size, "block_size": sa_block_size } diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py index 658ec64fb5c..5c374d0f2aa 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py @@ -1,7 +1,6 @@ from difflib import SequenceMatcher import pytest -import torch from utils.llm_data import llm_models_root from tensorrt_llm import LLM, SamplingParams @@ -44,17 +43,19 @@ def test_llama4(model_name, backend, tp_size, use_cuda_graph, "This is a very long prompt to exercise long context. Count up to 10000 from 1, 2, 3," + ", ".join(str(i) for i in range(4, 9000)) }, - { - "prompt": "<|image|>This image is of color", - "multi_modal_data": { - "image": [torch.ones(3, 1024, 1024)] - } - }, + # TODO: Fix multimodal test. + # { + # "prompt": "<|image|>This image is of color", + # "multi_modal_data": { + # "image": [torch.ones(3, 1024, 1024)] + # } + # }, ] expected_outputs = [ - " the head of state and head of government of the", ", 8999, 9000, ", - " white. What is the color of the background of" + " the head of state and head of government of the", + ", 9000, 9001, ", + # " white. What is the color of the background of" # TODO: Fix multimodal test. ] pytorch_config = dict(attn_backend=backend) diff --git a/tests/unittest/_torch/speculative/test_draft_target.py b/tests/unittest/_torch/speculative/test_draft_target.py index bbc2f1484e6..05e55b0ea7c 100644 --- a/tests/unittest/_torch/speculative/test_draft_target.py +++ b/tests/unittest/_torch/speculative/test_draft_target.py @@ -41,7 +41,6 @@ def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str): max_batch_size=max_batch_size, kv_cache_config=kv_cache_config, max_num_tokens=2048, - use_torch_sampler=True, ) spec_config = DraftTargetDecodingConfig( diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index 56228b4b77a..ffb8e33766a 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -60,7 +60,6 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str, # in this test. max_seq_len=8192, enable_chunked_prefill=enable_chunked_prefill, - use_torch_sampler=True, ) if enable_chunked_prefill: # Use a small max_num_tokens so that the chunked prefill path gets exercised. diff --git a/tests/unittest/_torch/test_attention_mla.py b/tests/unittest/_torch/test_attention_mla.py index cef85f34576..41c37031bf5 100644 --- a/tests/unittest/_torch/test_attention_mla.py +++ b/tests/unittest/_torch/test_attention_mla.py @@ -339,7 +339,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: accuracy_dict = { torch.bfloat16: (3e-2, 3e-3), - torch.float8_e4m3fn: (4.075e-1, 4.075e-2), + torch.float8_e4m3fn: (4e-1, 4e-2), } diff --git a/tests/unittest/_torch/test_flashinfer_star_attn.py b/tests/unittest/_torch/test_flashinfer_star_attn.py index 7bad00724c3..ef19d2e3cd8 100644 --- a/tests/unittest/_torch/test_flashinfer_star_attn.py +++ b/tests/unittest/_torch/test_flashinfer_star_attn.py @@ -13,7 +13,7 @@ from tensorrt_llm._torch.metadata import KVCacheParams from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager from tensorrt_llm.bindings.executor import KvCacheConfig -from tensorrt_llm.mapping import Mapping +from tensorrt_llm.mapping import CpType, Mapping class TestingStarAttentionMetadata(StarAttentionMetadata): @@ -144,7 +144,7 @@ def test_flashinfer_star_attention(self, scenario: Scenario): tokens_per_block = 64 max_seq_len = tokens_per_block * num_blocks cp_config = { - "cp_type": "star_attention", + "cp_type": CpType.STAR, "cp_anchor_size": scenario.anchor_size, "block_size": scenario.block_size } @@ -579,7 +579,7 @@ def test_attention_with_cuda_graphs( max_seq_len = tokens_per_block * num_blocks num_layers = 1 if isinstance(num_kv_heads, int) else len(num_kv_heads) cp_config = { - "cp_type": "star_attention", + "cp_type": CpType.STAR, "cp_anchor_size": test_scenario.anchor_size, "block_size": test_scenario.block_size } diff --git a/tests/unittest/_torch/test_overlap_scheduler.py b/tests/unittest/_torch/test_overlap_scheduler.py index 5ac3044a2eb..8d7406aacc8 100644 --- a/tests/unittest/_torch/test_overlap_scheduler.py +++ b/tests/unittest/_torch/test_overlap_scheduler.py @@ -21,10 +21,10 @@ def model_path(): return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0" -def create_llm(model_dir, disable_overlap_scheduler, use_torch_sampler): +def create_llm(model_dir, disable_overlap_scheduler, sampler_type): """Create LLM with specific overlap scheduler setting""" pytorch_config = dict(disable_overlap_scheduler=disable_overlap_scheduler, - use_torch_sampler=use_torch_sampler) + sampler_type=sampler_type) trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False) @@ -41,16 +41,15 @@ def create_llm(model_dir, disable_overlap_scheduler, use_torch_sampler): ) -@pytest.mark.parametrize("use_torch_sampler", [False, True]) +@pytest.mark.parametrize("sampler_type", ["TorchSampler", "TRTLLMSampler"]) @pytest.mark.high_cuda_memory -def test_overlap_scheduler_consistency(model_path, test_case, - use_torch_sampler): +def test_overlap_scheduler_consistency(model_path, test_case, sampler_type): # Test configuration prompts = test_case["prompts"] max_new_tokens = test_case["max_new_tokens"] temperature = test_case["temperature"] top_p = test_case["top_p"] - stop_words = test_case["stop_words"] if not use_torch_sampler else None + stop_words = test_case["stop_words"] sampling_config = SamplingParams(max_tokens=max_new_tokens, stop=stop_words, @@ -62,7 +61,7 @@ def test_overlap_scheduler_consistency(model_path, test_case, # Test with overlap scheduler enabled llm = create_llm(model_path, disable_overlap_scheduler=False, - use_torch_sampler=use_torch_sampler) + sampler_type=sampler_type) outputs_with_overlap = llm.generate(prompts, sampling_params=sampling_config, use_tqdm=True) @@ -74,7 +73,7 @@ def test_overlap_scheduler_consistency(model_path, test_case, # Test with overlap scheduler disabled llm = create_llm(model_path, disable_overlap_scheduler=True, - use_torch_sampler=use_torch_sampler) + sampler_type=sampler_type) outputs_without_overlap = llm.generate(prompts, sampling_params=sampling_config, use_tqdm=True) diff --git a/tests/unittest/_torch/test_return_logits.py b/tests/unittest/_torch/test_return_logits.py index 9010834a6f9..0d6a5e28ca6 100644 --- a/tests/unittest/_torch/test_return_logits.py +++ b/tests/unittest/_torch/test_return_logits.py @@ -16,10 +16,10 @@ @pytest.mark.parametrize("return_log_probs", [False, True]) @pytest.mark.parametrize("gather_generation_logits", [False, True]) @pytest.mark.parametrize("gather_context_logits", [False, True]) -@pytest.mark.parametrize("use_torch_sampler", [False, True]) +@pytest.mark.parametrize("sampler_type", ["TRTLLMSampler", "TorchSampler"]) @pytest.mark.parametrize("disable_overlap_scheduler", [False, True]) def test_generate_with_return_logits(disable_overlap_scheduler: bool, - use_torch_sampler: bool, + sampler_type: str, gather_context_logits: bool, gather_generation_logits: bool, return_log_probs: bool): @@ -27,7 +27,7 @@ def test_generate_with_return_logits(disable_overlap_scheduler: bool, or return_log_probs): # prune space pytest.skip("Nothing to test") - if use_torch_sampler and gather_context_logits: + if sampler_type == "TorchSampler" and gather_context_logits: pytest.skip("TorchSampler does not support gather_context_logits") build_config = BuildConfig() @@ -41,7 +41,7 @@ def test_generate_with_return_logits(disable_overlap_scheduler: bool, gather_generation_logits=gather_generation_logits, max_batch_size= 128, # reduce buffer sizes, specially for generation logits - use_torch_sampler=use_torch_sampler, + sampler_type=sampler_type, disable_overlap_scheduler=disable_overlap_scheduler, ) @@ -83,10 +83,10 @@ def test_generate_with_return_logits(disable_overlap_scheduler: bool, @pytest.mark.parametrize("return_log_probs", [False, True]) @pytest.mark.parametrize("gather_generation_logits", [False, True]) @pytest.mark.parametrize("gather_context_logits", [False, True]) -@pytest.mark.parametrize("use_torch_sampler", [False, True]) +@pytest.mark.parametrize("sampler_type", ["TRTLLMSampler", "TorchSampler"]) @pytest.mark.parametrize("disable_overlap_scheduler", [False, True]) def test_generate_async_with_return_logits(disable_overlap_scheduler: bool, - use_torch_sampler: bool, + sampler_type: str, gather_context_logits: bool, gather_generation_logits: bool, return_log_probs: bool): @@ -94,7 +94,7 @@ def test_generate_async_with_return_logits(disable_overlap_scheduler: bool, or return_log_probs): # prune space pytest.skip("Nothing to test") - if use_torch_sampler and gather_context_logits: + if sampler_type == "TorchSampler" and gather_context_logits: pytest.skip("TorchSampler does not support gather_context_logits") build_config = BuildConfig() @@ -108,7 +108,7 @@ def test_generate_async_with_return_logits(disable_overlap_scheduler: bool, gather_generation_logits=gather_generation_logits, max_batch_size= 128, # reduce buffer sizes, specially for generation logits - use_torch_sampler=use_torch_sampler, + sampler_type=sampler_type, disable_overlap_scheduler=disable_overlap_scheduler, ) sampling_params = SamplingParams( diff --git a/tests/unittest/api_stability/api_stability_core.py b/tests/unittest/api_stability/api_stability_core.py index 2278fad2011..61650d59097 100644 --- a/tests/unittest/api_stability/api_stability_core.py +++ b/tests/unittest/api_stability/api_stability_core.py @@ -27,6 +27,7 @@ from tensorrt_llm.llmapi import (CalibConfig, CompletionOutput, GuidedDecodingParams, QuantConfig, RequestOutput, SamplingParams) +from tensorrt_llm.llmapi.llm_args import SamplerType from tensorrt_llm.llmapi.llm_utils import LlmArgs from tensorrt_llm.logger import Singleton diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index 5a846dd7869..86f740c3844 100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -111,9 +111,9 @@ methods: annotation: bool default: False status: beta - use_torch_sampler: - annotation: bool - default: False + sampler_type: + annotation: Union[str, tensorrt_llm.llmapi.llm_args.SamplerType] + default: auto status: beta enable_iter_perf_stats: annotation: bool diff --git a/tests/unittest/llmapi/apps/_test_openai_chat.py b/tests/unittest/llmapi/apps/_test_openai_chat.py index e59a0fae9fa..6e58b094783 100644 --- a/tests/unittest/llmapi/apps/_test_openai_chat.py +++ b/tests/unittest/llmapi/apps/_test_openai_chat.py @@ -533,10 +533,10 @@ def test_stop_reason(client: openai.OpenAI, model_name: str, backend: str): 'server_with_custom_sampler', [ { - 'use_torch_sampler': True + 'sampler_type': "TorchSampler" }, # torch_sampler { - 'use_torch_sampler': False + 'sampler_type': "TRTLLMSampler" }, # trtllm_sampler ], indirect=True, diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py b/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py index ab3c5ac58c7..d92ca061672 100644 --- a/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py +++ b/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py @@ -32,7 +32,6 @@ def temp_extra_llm_api_options_file(request): "build_config": { "max_num_tokens": 16384, }, - "use_torch_sampler": True, } with open(temp_file_path, 'w') as f: diff --git a/tests/unittest/llmapi/apps/_test_openai_completions.py b/tests/unittest/llmapi/apps/_test_openai_completions.py index 4762f219960..3e1c96cff3c 100644 --- a/tests/unittest/llmapi/apps/_test_openai_completions.py +++ b/tests/unittest/llmapi/apps/_test_openai_completions.py @@ -395,10 +395,10 @@ async def test_completion_streaming(async_client: openai.AsyncOpenAI, 'server_with_custom_sampler', [ { - 'use_torch_sampler': True + 'sampler_type': "TorchSampler" }, # torch_sampler { - 'use_torch_sampler': False + 'sampler_type': "TRTLLMSampler" }, # trtllm_sampler ], indirect=True, diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_example.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_example.py index a86301a6748..5b28e12675c 100644 --- a/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_example.py +++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_example.py @@ -32,7 +32,6 @@ def temp_extra_llm_api_options_file(request): "build_config": { "max_num_tokens": 16384, }, - "use_torch_sampler": True, } with open(temp_file_path, 'w') as f: diff --git a/tests/unittest/llmapi/apps/utils.py b/tests/unittest/llmapi/apps/utils.py index ae78a3180bb..073760d51f3 100644 --- a/tests/unittest/llmapi/apps/utils.py +++ b/tests/unittest/llmapi/apps/utils.py @@ -151,8 +151,7 @@ def make_server_with_custom_sampler_fixture(api_type: str) -> Callable: def server_with_custom_sampler(model_name: str, request: Any, backend: str, tmp_path: Path) -> RemoteOpenAIServer: '''Fixture to launch a server (pytorch backend only) with a custom sampler configuration.''' - use_torch_sampler = getattr(request, 'param', - {}).get('use_torch_sampler', True) + sampler_type = getattr(request, 'param', {}).get('sampler_type', "auto") if backend != 'pytorch': pytest.skip( f"Server with custom sampler is only supported for pytorch backend, skipping for {backend}" @@ -162,7 +161,7 @@ def server_with_custom_sampler(model_name: str, request: Any, backend: str, temp_file_path = tmp_path / f'test_sampler_config_{request.node.name}.yaml' extra_llm_api_options_dict = { 'enable_chunked_prefill': True, - 'use_torch_sampler': use_torch_sampler + 'sampler_type': sampler_type } with temp_file_path.open('w') as f: yaml.dump(extra_llm_api_options_dict, f) diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index be0de40eb65..6b78c46bd73 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -255,14 +255,11 @@ def test_embedding_bias_with_torch_sampler_strategies(enable_mixed_sampler, sampling_params = SamplingParams(**sampling_kwargs) - llm_test_harness( - llama_model_path, - prompts, - ["Z Z Z Z Z Z"], - sampling_params=sampling_params, - backend="pytorch", - use_torch_sampler=True, # Use TorchSampler to test all 3 paths - enable_mixed_sampler=enable_mixed_sampler) + llm_test_harness(llama_model_path, + prompts, ["Z Z Z Z Z Z"], + sampling_params=sampling_params, + backend="pytorch", + enable_mixed_sampler=enable_mixed_sampler) def llama_7b_lora_from_dir_test_harness(**llm_kwargs) -> None: diff --git a/tests/unittest/others/test_mapping.py b/tests/unittest/others/test_mapping.py index 6d836f220b0..bc9839239bf 100644 --- a/tests/unittest/others/test_mapping.py +++ b/tests/unittest/others/test_mapping.py @@ -44,3 +44,40 @@ def test_mapping(self): self.assertTrue(m.is_last_pp_rank()) self.assertEqual(m.prev_pp_rank(), 4) self.assertEqual(m.next_pp_rank(), 0) + + m = Mapping(world_size=2, rank=0, cp_size=2) + self.assertEqual(len(m.tp_groups), 2) + self.assertEqual(len(m.pp_groups), 2) + self.assertEqual(len(m.cp_groups), 1) + self.assertEqual(m.tp_group, [0]) + self.assertEqual(m.pp_group, [0]) + self.assertEqual(m.cp_group, [0, 1]) + + m = Mapping(world_size=8, rank=3, tp_size=2, pp_size=2, cp_size=2) + self.assertEqual(len(m.tp_groups), 4) + self.assertEqual(len(m.pp_groups), 4) + self.assertEqual(len(m.cp_groups), 4) + self.assertEqual(m.tp_group, [2, 3]) + self.assertEqual(m.pp_group, [3, 7]) + self.assertEqual(m.cp_group, [1, 3]) + self.assertTrue(m.is_first_pp_rank()) + self.assertFalse(m.is_last_pp_rank()) + self.assertFalse(m.is_first_cp_rank()) + self.assertTrue(m.is_last_cp_rank()) + self.assertEqual(m.prev_pp_rank(), 7) + self.assertEqual(m.next_pp_rank(), 7) + self.assertEqual(m.prev_cp_rank(), 1) + self.assertEqual(m.next_cp_rank(), 1) + + m = Mapping(world_size=16, rank=9, tp_size=2, pp_size=2, cp_size=4) + self.assertEqual(m.tp_group, [8, 9]) + self.assertEqual(m.pp_group, [1, 9]) + self.assertEqual(m.cp_group, [9, 11, 13, 15]) + self.assertFalse(m.is_first_pp_rank()) + self.assertTrue(m.is_last_pp_rank()) + self.assertTrue(m.is_first_cp_rank()) + self.assertFalse(m.is_last_cp_rank()) + self.assertEqual(m.prev_pp_rank(), 1) + self.assertEqual(m.next_pp_rank(), 1) + self.assertEqual(m.prev_cp_rank(), 15) + self.assertEqual(m.next_cp_rank(), 11) diff --git a/tests/unittest/test_pip_install.py b/tests/unittest/test_pip_install.py index d75bfbaf420..11288e09cec 100644 --- a/tests/unittest/test_pip_install.py +++ b/tests/unittest/test_pip_install.py @@ -51,9 +51,6 @@ def test_pip_install(): help="The wheel path") args = parser.parse_args() - if not os.environ.get("CUDA_HOME"): - os.environ["CUDA_HOME"] = "/usr/local/cuda" - print("########## Install required system libs ##########") if not os.path.exists("/usr/local/mpi/bin/mpicc"): subprocess.check_call("apt-get -y install libopenmpi-dev", shell=True) diff --git a/triton_backend/tools/tests/test_llmapi_cancel.py b/triton_backend/tools/tests/test_llmapi_cancel.py new file mode 100644 index 00000000000..4cd8c0c6063 --- /dev/null +++ b/triton_backend/tools/tests/test_llmapi_cancel.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys +from functools import partial + +import numpy as np +from tritonclient import grpc as grpcclient +from tritonclient.utils import InferenceServerException + +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + '/..') +from llmapi_client import (UserData, _prepare_inputs, callback, + prepare_stop_signals) + +if __name__ == "__main__": + input_data = np.array([ + "The current time is", + ], dtype=object) + output_len = 100 + inputs = _prepare_inputs(input_data, output_len) + + stop_inputs = prepare_stop_signals() + request_id = 1 + user_data = UserData() + with grpcclient.InferenceServerClient( + url="localhost:8001", + verbose=False, + ssl=False, + root_certificates=None, + private_key=None, + certificate_chain=None, + ) as triton_client: + + # Send stop request for non-existing request + triton_client.async_infer( + "tensorrt_llm", + stop_inputs, + request_id=str(request_id), # Request does not exist yet + callback=partial(callback, user_data), + parameters={'Streaming': False}) + + result = user_data._completed_requests.get() + assert isinstance(result, InferenceServerException) + assert result.status() == "StatusCode.CANCELLED" + + # Send actual request + infer_response = triton_client.async_infer( + "tensorrt_llm", + inputs, + request_id=str(request_id), + callback=partial(callback, user_data), + parameters={'Streaming': False}) + + result = user_data._completed_requests.get() + print( + f'Output text: {result.as_numpy("text_output")[0].decode("utf-8")}') + + # Cancel request after it is completed + infer_response.cancel() + + # Send stop request for completed request + triton_client.async_infer("tensorrt_llm", + stop_inputs, + request_id=str(request_id), + callback=partial(callback, user_data), + parameters={'Streaming': False}) + + cancel_result = user_data._completed_requests.get() + assert isinstance(cancel_result, InferenceServerException) + assert cancel_result.status() == "StatusCode.CANCELLED" + + # Send a second request to check if server is still healthy + infer_response_2 = triton_client.async_infer( + "tensorrt_llm", + inputs, + request_id=str(request_id + 1), + callback=partial(callback, user_data), + parameters={'Streaming': False}) + + # Get result of second request + result_2 = user_data._completed_requests.get() + print('Got completed request') + + print( + f'Output text: {result_2.as_numpy("text_output")[0].decode("utf-8")}' + ) + + # Check that both results match + assert np.array_equal(result.as_numpy("text_output"), + result_2.as_numpy("text_output"))