diff --git a/docs/source/installation/linux.md b/docs/source/installation/linux.md
index 6f1383f3ef8..9bccba451c7 100644
--- a/docs/source/installation/linux.md
+++ b/docs/source/installation/linux.md
@@ -32,6 +32,7 @@
 ```bash
 pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
 ```
+ **This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.**

2. Sanity check the installation by running the following in Python (tested on Python 3.12):

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 3f55a4e1095..9e316617186 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -28,101 +28,119 @@
nvidia/Llama-3.1-405B-Instruct-FP4
```

#### Llama 3.3 70B FP4
+
| | GPU | B200 | | | |
-|:-----------------------------|:---|:----------|:----------|:----------|:----------|
-| | TP Size | 1 | 2 | 4 | 8 |
-| ISL, OSL| | | | | |
-| | | | | | |
-| 128, 128 | | 11,253.28 | 17,867.66 | 24,944.50 | 27,471.49 |
-| 128, 2048 | | 9,925.00 | 15,459.71 | 23,608.58 | 30,742.86 |
-| 128, 4096 | | 6,318.92 | 8,711.88 | 17,659.74 | 24,947.05 |
-| 500, 2000 | | 7,559.88 | 10,602.27 | 20,910.23 | 28,182.34 |
-| 1000, 1000 | | 6,866.96 | 10,838.01 | 16,567.86 | 19,991.64 |
-| 1000, 2000 | | 6,736.88 | 9,132.08 | 15,737.02 | 20,518.04 |
-| 1024, 2048 | | 6,580.56 | 8,767.45 | 15,722.55 | 20,437.96 |
-| 2048, 128 | | 1,375.49 | 1,610.69 | 2,707.58 | 3,717.82 |
-| 2048, 2048 | | 4,544.73 | 6,956.14 | 12,292.23 | 15,661.22 |
-| 5000, 500 | | 1,488.19 | 2,379.73 | 3,588.45 | 4,810.21 |
-| 20000, 2000 | | 580.96 | 1,043.58 | 1,957.84 | 3,167.30 |
+|:------------------------|:--------|:----------|:----------|:----------|:----------|
+| | TP Size | 1 | 2 | 4 | 8 |
+| ISL, OSL | | | | | |
+| | | | | | |
+| 128, 128 | | 10,994.48 | 17,542.11 | 24,667.31 | 27,272.27 |
+| 128, 2048 | | 9,580.46 | 15,432.35 | 23,568.12 | 31,174.31 |
+| 128, 4096 | | 6,418.39 | 9,841.53 | 17,808.76 | 25,229.25 |
+| 500, 2000 | | 7,343.32 | 11,850.57 | 20,709.67 | 28,038.78 |
+| 1000, 1000 | | 6,752.53 | 10,815.88 | 16,413.04 | 20,060.66 |
+| 1000, 2000 | | 6,670.07 | 9,830.73 | 15,597.49 | 20,672.37 |
+| 1024, 2048 | | 6,636.75 | 9,807.13 | 15,519.23 | 20,617.28 |
+| 2048, 128 | | 1,342.17 | 1,989.41 | 3,033.14 | 4,035.64 |
+| 5000, 500 | | 1,429.67 | 2,419.67 | 3,686.84 | 5,182.96 |
+| 20000, 2000 | | 629.77 | 1,177.01 | 2,120.66 | 3,429.03 |

#### Llama 3.1 405B FP4
-| | GPU | B200 |
-|:-----------------------------|:---|:----------|
-| | TP Size | 8 |
-| ISL, OSL| | |
-| | | |
-| 128, 128 | | 9,184.83 |
-| 128, 2048 | | 10,387.23 |
-| 128, 4096 | | 8,741.80 |
-| 500, 2000 | | 9,242.34 |
-| 1000, 1000 | | 7,565.50 |
-| 1000, 2000 | | 7,696.76 |
-| 1024, 2048 | | 7,568.93 |
-| 2048, 128 | | 953.57 |
-| 2048, 2048 | | 6,092.32 |
-| 5000, 500 | | 1,332.22 |
-| 20000, 2000 | | 961.58 |
+
+| | GPU | B200 | |
+|:------------------------|:------- |:---------|:----------|
+| | TP Size | 4 | 8 |
+| ISL, OSL | | | |
+| | | | |
+| 128, 128 | | 6,163.81 | 9,002.90 |
+| 128, 2048 | | 7,081.21 | 10,288.28 |
+| 128, 4096 | | 6,028.37 | 8,713.77 |
+| 500, 2000 | | 5,858.75 | 9,125.86 |
+| 1000, 1000 | | 4,848.00 | 7,582.97 |
+| 1000, 2000 | | 5,375.25 | 7,626.28 |
+| 1024, 2048 | | 5,345.70 | 7,464.03 |
+| 2048, 128 | | 693.55 | 1,086.56 |
+| 5000, 500 | | 947.49 | 1,532.45 |
+| 20000, 2000 | | 641.11 | 1,097.84 |

### FP8 Models:
```
nvidia/Llama-3.1-8B-Instruct-FP8
-nvidia/Llama-3.1-70B-Instruct-FP8
+nvidia/Llama-3.3-70B-Instruct-FP8
nvidia/Llama-3.1-405B-Instruct-FP8
+nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
```

#### Llama 3.1 8B FP8
-| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
+
+| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
|:-----------------------------|:---|:------------------|:-----------------|
-| | TP Size | 1 | 1 |
+| | TP Size | 1 | 1 |
| ISL, OSL | | | |
| | | | |
-| 128, 128 | | 28,447.38 | 27,568.68 |
-| 128, 2048 | | 23,294.74 | 22,003.62 |
-| 128, 4096 | | 17,481.48 | 13,640.35 |
-| 500, 2000 | | 21,462.57 | 17,794.39 |
-| 1000, 1000 | | 17,590.60 | 15,270.02 |
-| 1000, 2000 | | 17,139.51 | 13,850.22 |
-| 1024, 2048 | | 16,970.63 | 13,374.15 |
-| 2048, 128 | | 3,531.33 | 3,495.05 |
-| 2048, 2048 | | 12,022.38 | 9,653.67 |
-| 5000, 500 | | 3,851.65 | 3,371.16 |
-| 20000, 2000 | | 1,706.06 | 1,340.92 |
-
-#### Llama 3.1 70B FP8
-| | GPU | H200 141GB HBM3 | | | | H100 80GB HBM3 | | | |
+| 128, 128 | | 27,970.14 | 27,688.36 |
+| 128, 2048 | | 23,326.38 | 21,841.15 |
+| 128, 4096 | | 17,508.51 | 13,730.89 |
+| 500, 2000 | | 21,390.41 | 17,833.34 |
+| 1000, 1000 | | 17,366.89 | 15,270.62 |
+| 1000, 2000 | | 16,831.31 | 13,798.08 |
+| 1024, 2048 | | 16,737.03 | 13,385.50 |
+| 2048, 128 | | 3,488.03 | 3,414.67 |
+| 5000, 500 | | 3,813.69 | 3,394.54 |
+| 20000, 2000 | | 1,696.66 | 1,345.42 |
+
+#### Llama 3.3 70B FP8
+
+| | GPU | H200 141GB HBM3 | | | | H100 80GB HBM3 | | | |
|:-----------------------------|:---|:------------------|:---------|:----------|:----------|:-----------------|:---------|:----------|:----------|
-| | TP Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 |
-| ISL, OSL| | | | | | | | | |
+| | TP Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 |
+| ISL, OSL | | | | | | | | | |
| | | | | | | | | | |
-| 128, 128 | | 3,657.58 | 6,477.50 | 10,466.04 | 15,554.57 | 3,191.27 | 6,183.41 | 10,260.68 | 14,686.01 |
-| 128, 2048 | | 4,351.07 | 8,450.31 | 13,438.71 | 20,750.58 | 745.19 | 5,822.02 | 11,442.01 | 17,463.99 |
-| 128, 4096 | | 2,696.61 | 5,598.92 | 11,524.93 | 16,634.90 | | 3,714.87 | 8,209.91 | 12,598.55 |
-| 500, 2000 | | 3,475.58 | 6,712.35 | 12,332.32 | 17,311.28 | | 4,704.31 | 10,278.02 | 14,630.41 |
-| 1000, 1000 | | 2,727.42 | 5,097.36 | 8,698.15 | 12,794.92 | 734.67 | 4,191.26 | 7,427.35 | 11,082.48 |
-| 1000, 2000 | | 2,913.54 | 5,841.15 | 9,016.49 | 13,174.68 | 526.31 | 3,920.44 | 7,590.35 | 11,108.11 |
-| 1024, 2048 | | 2,893.02 | 5,565.28 | 9,017.72 | 13,117.34 | 525.43 | 3,896.14 | 7,557.32 | 11,028.32 |
-| 2048, 128 | | 433.30 | 772.97 | 1,278.26 | 1,947.33 | 315.90 | 747.51 | 1,240.12 | 1,840.12 |
-| 2048, 2048 | | 1,990.25 | 3,822.83 | 7,068.68 | 10,529.06 | 357.98 | 2,732.86 | 5,640.31 | 8,772.88 |
-| 5000, 500 | | 543.88 | 1,005.81 | 1,714.77 | 2,683.22 | 203.27 | 866.77 | 1,571.92 | 2,399.78 |
-| 20000, 2000 | | 276.99 | 618.01 | 1,175.35 | 2,021.08 | | 408.43 | 910.77 | 1,568.84 |
+| 128, 128 | | 3,605.47 | 6,427.69 | 10,407.42 | 15,434.37 | 3,128.33 | 6,216.91 | | |
+| 128, 2048 | | 4,315.80 | 8,464.03 | 13,508.59 | 20,759.72 | 756.42 | 5,782.57 | 11,464.94 | 17,424.32 |
+| 128, 4096 | | 2,701.17 | 5,573.55 | 11,458.56 | 16,668.75 | | 3,868.37 | 8,206.39 | 12,624.61 |
+| 500, 2000 | | 3,478.76 | 6,740.06 | 12,200.18 | | | 4,684.06 | 9,903.53 | 14,553.93 |
+| 1000, 1000 | | 2,744.32 | 5,119.72 | 8,685.44 | 12,744.51 | 742.14 | 4,247.19 | 7,435.65 | 11,018.81 |
+| 1000, 2000 | | 2,896.44 | 5,847.26 | 9,031.21 | 13,141.17 | 533.74 | 3,866.53 | 7,611.12 | 11,139.22 |
+| 1024, 2048 | | 2,874.18 | 5,568.61 | 8,946.71 | 13,082.62 | 530.16 | 3,796.68 | 7,575.24 | 11,004.31 |
+| 2048, 128 | | 435.90 | 772.67 | 1,264.76 | | | 736.89 | 1,213.33 | 1,839.22 |
+| 2048, 2048 | | | | | 10,412.85 | | | | |
+| 5000, 500 | | 545.96 | 997.15 | 1,698.22 | 2,655.28 | 204.94 | 862.91 | 1,552.68 | 2,369.84 |
+| 20000, 2000 | | 276.66 | 620.33 | 1,161.29 | 1,985.85 | | 416.13 | 903.66 | 1,554.10 |

#### Llama 3.1 405B FP8
-| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
+
+| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
|:-----------------------------|:---|:------------------|:-----------------|
-| | TP Size | 8 | 8 |
+| | TP Size | 8 | 8 |
| ISL, OSL | | | |
| | | | |
-| 128, 128 | | 3,800.11 | 3,732.40 |
-| 128, 2048 | | 5,661.13 | 4,572.23 |
-| 128, 4096 | | 5,167.18 | 2,911.42 |
-| 500, 2000 | | 4,854.29 | 3,661.85 |
-| 1000, 1000 | | 3,332.15 | 2,963.36 |
-| 1000, 2000 | | 3,682.15 | 3,253.17 |
-| 1024, 2048 | | 3,685.56 | 3,089.16 |
-| 2048, 128 | | 453.42 | 448.89 |
-| 2048, 2048 | | 3,055.73 | 2,139.94 |
-| 5000, 500 | | 656.11 | 579.14 |
-| 20000, 2000 | | 514.02 | 370.26 |
+| 128, 2048 | | 5,567.87 | |
+| 128, 4096 | | 5,136.85 | |
+| 500, 2000 | | 4,787.61 | 3,673.91 |
+| 1000, 1000 | | 3,286.30 | 3,012.22 |
+| 1000, 2000 | | 3,636.76 | 3,262.20 |
+| 1024, 2048 | | 3,618.66 | 3,109.70 |
+| 2048, 128 | | 443.10 | 449.02 |
+| 5000, 500 | | 645.46 | |
+| 20000, 2000 | | | 372.12 |
+
+#### Llama 4 Maverick FP8
+
+| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
+|:-----------------------------|:---|:------------------|:-----------------|
+| | TP Size | 8 | 8 |
+| ISL, OSL | | | |
+| | | | |
+| 128, 2048 | | 27,543.87 | |
+| 128, 4096 | | 18,541.01 | 11,163.12 |
+| 500, 2000 | | 21,117.34 | |
+| 1000, 2000 | | | 10,556.00 |
+| 1024, 2048 | | 16,859.45 | 11,584.33 |
+| 2048, 128 | | 4,364.06 | 3,832.38 |
+| 2048, 2048 | | 12,800.89 | |
+| 5000, 500 | | 5,128.60 | |
+| 20000, 2000 | | 1,764.27 | 1,400.79 |

## Reproducing Benchmarked Results

@@ -198,6 +216,8 @@ a model name (HuggingFace reference or path to a local model), a [generated data
trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options
```
+The data collected for the v0.20 benchmarks was run with the following file:
+
`llm_options.yml`
```yaml
cuda_graph_config:
@@ -220,7 +240,7 @@ cuda_graph_config:
    - 8192
```

-In majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out of memory issue.
+In a majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out of memory issue.

The results will be printed to the terminal upon benchmark completion. For example,

diff --git a/docs/source/quick-start-guide.md b/docs/source/quick-start-guide.md
index b3027e0737a..12b9a5ec037 100644
--- a/docs/source/quick-start-guide.md
+++ b/docs/source/quick-start-guide.md
@@ -8,13 +8,15 @@ This is the starting point to try out TensorRT-LLM. Specifically, this Quick Sta
There are multiple ways to install and run TensorRT-LLM. For most users, the options below should be ordered from simple to complex. The approaches are equivalent in terms of the supported features.
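Putting the benchmark options above together, a complete invocation with the higher KV cache fraction might look like the following sketch. This is not part of the patch itself: `$model_name` and `$dataset_file` are the same placeholders used in the documentation above, and the exact flag placement should be checked against `trtllm-bench --help` for your installed version.

```bash
# Sketch only: combines the documented throughput command, the llm_options.yml
# extra-options file, and the higher KV cache fraction mentioned above.
trtllm-bench --model $model_name throughput \
  --dataset $dataset_file \
  --backend pytorch \
  --extra_llm_api_options llm_options.yml \
  --kv_cache_free_gpu_mem_fraction 0.95
```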
+Note: **This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.**
+
1. [](installation/containers)
1. Pre-built release wheels on [PyPI](https://pypi.org/project/tensorrt-llm) (see [](installation/linux))
1. [Building from source](installation/build-from-source-linux)

-The following examples can most easily be executed using the prebuilt [Docker release container available on NGC](https://registry.ngc.nvidia.com/orgs/nvstaging/teams/tensorrt-llm/containers/release) (see also [release.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/release.md) on GitHub).
+The following examples can most easily be executed using the prebuilt [Docker release container available on NGC](https://registry.ngc.nvidia.com/orgs/nvstaging/teams/tensorrt-llm/containers/release) (see also [release.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/release.md) on GitHub). Ensure you run these commands as a user with sufficient permissions to use Docker.

## LLM API

@@ -92,7 +94,7 @@ For detailed examples and command syntax, refer to the [trtllm-serve](commands/t
2. Open a new terminal and use the following command to directly attach to the running container:

-```bash
+```bash
docker exec -it bash
```

diff --git a/docs/source/reference/support-matrix.md b/docs/source/reference/support-matrix.md
index 37fada2c0de..0c59baf992b 100644
--- a/docs/source/reference/support-matrix.md
+++ b/docs/source/reference/support-matrix.md
@@ -25,6 +25,8 @@ TensorRT-LLM optimizes the performance of a range of well-known models on NVIDIA
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B` | L |
| `Qwen2VLForConditionalGeneration` | Qwen2-VL | `Qwen/Qwen2-VL-7B-Instruct` | L + V |
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | `Qwen/Qwen2.5-VL-7B-Instruct` | L + V |
+| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B` | L |
+| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B` | L |

Note:
- L: Language only
@@ -72,7 +74,7 @@ Note:
- [mT5](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/enc_dec)
- [OPT](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/contrib/opt)
- [Phi-1.5/Phi-2/Phi-3](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/phi)
-- [Qwen/Qwen1.5/Qwen2](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/qwen)
+- [Qwen/Qwen1.5/Qwen2/Qwen3](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/qwen)
- [Qwen-VL](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/qwenvl)
- [RecurrentGemma](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/recurrentgemma)
- [Replit Code](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/contrib/mpt) [^replitcode]

diff --git a/docs/source/release-notes.md b/docs/source/release-notes.md
index bb663aba7d2..dee84ecfde5 100644
--- a/docs/source/release-notes.md
+++ b/docs/source/release-notes.md
@@ -4,6 +4,152 @@
All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).
+## TensorRT-LLM Release 0.21.0
+
+### Key Features and Enhancements
+- **Model Support**
+  - Added Gemma3 VLM support
+- **Features**
+  - Added large-scale EP support
+  - Integrated NIXL into the communication layer of the disaggregated service
+  - Added Fabric Memory support for KV cache transfer
+  - Added MCP support in ScaffoldingLLM
+  - Added support for w4a8_mxfp4_fp8 quantization
+  - Added support for FP8 rowwise quantization
+  - Added generation logits support in the TRTLLM Sampler
+  - Added log probs support in the TRTLLM Sampler
+  - Optimized TRTLLM Sampler performance for the single-beam, single-step case
+  - Enabled disaggregated serving for Qwen-3
+  - Added EAGLE3 support for Qwen-3
+  - Fused finalize and allreduce for the Qwen-MoE model
+  - Refactored the fused MoE module
+  - Added support for chunked attention on Blackwell and Hopper
+  - Introduced sliding-window attention kernels for the generation phase on Blackwell
+  - Updated DeepSeek FP8 TRT-LLM Gen cubins to improve performance in large batch size scenarios
+  - Added FP8 block-scale GEMM support on SM89
+  - Enabled the overlap scheduler between draft forwards
+  - Added piecewise CUDA graph support for MLA
+  - Added model-agnostic one-engine EAGLE3
+  - Enabled Finalize + Allreduce + add + rmsnorm fusion
+  - Integrated TRT-LLM Gen FP8 block scale MoE with the PyTorch workflow kernel autotuner
+  - Added support for EAGLE3 + disaggregated serving in the two-model speculative decoding flow
+  - Validated Llama 3.1 models on H200 NVL
+- Benchmark:
+  - Added an all_reduce.py benchmark script for testing
+  - Added beam width to the trtllm-bench latency command
+  - Fixed trtllm-bench iter_stats and cuda_graph_batch_sizes errors
+  - Enabled trtllm-bench to run LoRA and added basic end-to-end perf testing capability for LoRA
+  - Supported post_proc for trtllm-bench
+  - Added a no_kv_cache_reuse option and streaming support for trtllm serve bench
+
+### Infrastructure Changes
+- The base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:25.05-py3`.
+- The base Docker image for TensorRT-LLM Backend is updated to `nvcr.io/nvidia/tritonserver:25.05-py3`.
+- The dependent public PyTorch version is updated to 2.7.1.
+- The dependent TensorRT version is updated to 10.11.
+- The dependent NVIDIA ModelOpt version is updated to 0.31.
+- The dependent NCCL version is updated to 2.27.5.
+
+### API Changes
+- Set _AutoDeployLlmArgs as the primary config object
+- Removed the decoder request from the decoder interface
+- Enhanced the torch_compile_config in LLM args
+- Removed the redundant use_kv_cache field from PytorchConfig
+- Moved allreduce_strategy from the committed API to reference
+
+### Fixed Issues
+- Fixed a disaggregated service hang when MNNVL two-shot AllReduce is enabled (#4678)
+- Fixed the EP load balancer with the MTP layer and route offset by EP rank (#4767)
+- Fixed CUDA graph padding for speculative decoding (#4853)
+- Fixed a Llama 4 long-context issue (#4809)
+- Fixed the max_num_sequences calculation with overlap scheduling (#4532)
+- Fixed chunked prefill + overlap scheduling (#5761)
+- Fixed a trtllm-bench hang caused by LLM API IPC (#4798)
+- Fixed an index-out-of-bounds error in speculative decoding (#5954)
+- Fixed an MTP illegal memory access in CUDA graph warmup (#5947)
+- Fixed a "no free slots" error with speculative decoding + disaggregated serving (#5975)
+- Fixed an off-by-one attention window size for Gemma3 1B (#5564)
+
+### Known Issues
+- accuracy/test_cli_flow::TestGpt2::test_beam_search_large is broken.
+- Enabling disaggregated serving, MTP, and the overlap scheduler at the same time can lead to accuracy problems.
+
+## TensorRT-LLM Release 0.20.0
+
+### Key Features and Enhancements
+- **Model Support**
+  - Added Qwen3 support. Refer to the “Qwen3” section in `examples/models/core/qwen/README.md`.
+  - Added HyperCLOVAX-SEED-Vision support in the PyTorch flow. Refer to `examples/models/contrib/hyperclovax/README.md`.
+  - Added Dynasor-CoT in scaffolding examples. Refer to `examples/scaffolding/contrib/Dynasor/README.md`.
+  - Added Mistral Small 3.1 24B VLM support in the TRT workflow
+  - Added Gemma3-1b-it support in the PyTorch workflow
+  - Added Nemotron-H model support
+  - Added Eagle-3 support for LLAMA4
+- **PyTorch workflow**
+  - Added LoRA support
+  - Added return logits support
+  - Adopted the new logprob definition in the PyTorch flow
+  - Enabled per-request stats with the PyTorch backend
+  - Enabled LogitsProcessor in the PyTorch backend
+- Benchmark:
+  - Added beam width to the low latency benchmark.
+  - Fixed trtllm-bench iter_stats and cuda_graph_batch_sizes errors.
+  - Removed the deprecated Python runtime benchmark.
+  - Added benchmark support for scaffolding.
+- Multimodal models
+  - Added support in trtllm-serve
+  - Added support in trtllm-bench; the support is limited to image-only for now
+- Supported DeepSeek-R1 W4A8 on Hopper
+- Added RTX Pro 6000 support on a single GPU
+- Integrated the Llama4 input processor
+- Added CGA reduction FMHA kernels on Blackwell
+- Enabled chunked context for FlashInfer
+- Supported KV cache reuse for MLA
+- Added piecewise CUDA graph support
+- Supported multiple LoRA adapters and TP
+- Added a KV cache-aware router for disaggregated serving
+- Added unfused attention for native support
+- Added a group_rms_norm kernel to normalize multiple inputs in a single operator
+- Added a smart router for the MoE module
+- Added head size 72 support for the QKV preprocessing kernel
+- Added MNNVL MoE A2A support
+- Optimized large embedding tables in multimodal models
+- Supported Top-K logprobs and prompt_logprobs in the LLM API
+- Enabled the overlap scheduler in the TRT workflow via the executor API
+
+### Infrastructure Changes
+- **The TRT-LLM team formally releases the docker image on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags).**
+- The pre-built TensorRT-LLM wheel on PyPI is now linked against PyTorch 2.7.0, which uses the CXX11 ABI
+- The dependent TensorRT version is updated to 10.10.0
+- The dependent CUDA version is updated to 12.9.0
+- The dependent public PyTorch version is updated to 2.7.0
+- The dependent NVIDIA ModelOpt version is updated to 0.29.0
+- The dependent NCCL version is maintained at 2.25.1
+- Open-sourced XQA kernels
+- The dependent datasets version was upgraded to 3.1.0
+- Migrated the Triton backend into the TensorRT-LLM repo as a TensorRT-LLM submodule
+- Downgraded the GCC toolset version from 13 to 11
+
+### API Changes
+- [Breaking Change]: Enabled scheduling overlap by default
+- Removed the deprecated GptSession/V1 from the TRT workflow
+- Set _AutoDeployLlmArgs as the primary config object
+- Allowed overriding CLI arguments with a YAML file in trtllm-serve
+- Introduced a multimodal embedding field in LlmRequest
+
+### Fixed Issues
+- Fixed a hang when the context server doesn't have enough capacity for KV cache (#3095)
+- Fixed C++ decoder synchronization in PyTorch (#3106)
+- Fixed a bug related to creating a CUDA stream as a default parameter, which would be initialized during import (#3764)
+- Fixed an attention DP bug on the Qwen3 MoE model (#4141)
+- Fixed an illegal memory access when running LLaMA 4 with CUDA Graph enabled (#4101)
+- Reset planned states to avoid a memory leak in TrtllmAttentionWrapper (#4227)
+
+### Known Issues
+- Multi-GPU model support on RTX Pro 6000
+
## TensorRT-LLM Release 0.19.0

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index b1653951ac5..c8523deea2e 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -38,6 +38,7 @@
from tqdm import tqdm
from transformers import PretrainedConfig

+from tensorrt_llm._ipc_utils import can_access_peer
from tensorrt_llm._utils import get_sm_version
from tensorrt_llm.functional import PositionEmbeddingType
from tensorrt_llm.llmapi.utils import enable_llm_debug
@@ -602,6 +603,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
        self.enable_attention_dp = mapping.enable_attention_dp

        self.mlp_tp_size = mapping.tp_size
+        self.is_p2p_supported = can_access_peer(mapping)

        self.fusion_config = EagerFusionConfig()
        self.enable_fusion = os.environ.get(
@@ -796,7 +798,7 @@ def _run_MoE(hidden_states, hidden_states_fp4, do_finalize):
                not (hidden_states.shape[0] <= self.moe_allreduce.max_token
                     and self.fusion_config.POST_MOE_FUSION
                     and self.model_config.moe_backend == "TRTLLM"
-                     and self.mlp.experts.has_nvfp4))
+                     and self.mlp.experts.has_nvfp4 and self.is_p2p_supported))

        hidden_states = _run_MoE(hidden_states,
                                 hidden_states_fp4=None,

diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 1a22caf2d7d..3e364ac9a91 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -1216,7 +1216,8 @@ def _prepare_tp_inputs(
            if next_draft_tokens_device is None or request.is_dummy or request.py_batch_idx is None:
                # get token ids, including input token ids and draft token ids. For these dummy requests,
                # no need to copy the token ids.
-                if not request.is_dummy:
+                if not (request.is_attention_dp_dummy
+                        or request.is_cuda_graph_dummy):
                    input_ids.append(request.get_last_tokens(0))
                    input_ids.extend(request.py_draft_tokens)
                    draft_tokens.extend(request.py_draft_tokens)

diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
index 5b440e8b90e..934813aa4c4 100644
--- a/tensorrt_llm/llmapi/llm.py
+++ b/tensorrt_llm/llmapi/llm.py
@@ -544,14 +544,6 @@ def _check_arguments(self, prompt_len: int, query_len: int,
                raise ValueError(
                    f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead."
                )
-            # Check prompt length and query length against max_num_tokens to filter illegal requests.
-            # Skip check for gen-only requests
-            if self.args.backend == "pytorch" and not self.args.enable_chunked_prefill and not is_gen_only:
-                max_num_tokens = self.args.max_num_tokens
-                if max_num_tokens and prompt_len / self.args.parallel_config.cp_size + query_len > max_num_tokens:
-                    raise ValueError(
-                        f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}), query length ({query_len}) should not exceed "
-                        f"max_num_tokens ({max_num_tokens})")
            return

        build_config = self.args.build_config
@@ -568,7 +560,7 @@ def _check_arguments(self, prompt_len: int, query_len: int,
            (sampling_params.max_tokens or 0) > max_seq_len):
            raise ValueError(
                f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}) and query length ({query_len}) max_tokens ({sampling_params.max_tokens}) should not exceed "
-                f"max_seq_len ({max_seq_len})")
+                f"max_seq_len ({build_config.max_seq_len})")

        if sampling_params.use_beam_search and sampling_params.best_of > build_config.max_beam_width:
            if sampling_params.n == sampling_params.best_of:

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 61f8c199e9d..fb46cd337e8 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -647,7 +647,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
        if torch_compile and mtp_nextn > 0:
            pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
            enable_piecewise_cuda_graph=cuda_graph,
@@ -687,7 +687,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
            pytest.skip("https://nvbugs/5252313")
        if torch_compile and pp_size > 1:
            pytest.skip("PP with torch.compile is not supported yet.")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
            enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -725,7 +725,7 @@ def test_fp8_block_scales(self, mtp, fp8kv, attention_dp, cuda_graph,
                              overlap_scheduler, torch_compile):
        if torch_compile and mtp != "disable":
            pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
            enable_piecewise_cuda_graph=cuda_graph,
@@ -813,7 +813,7 @@ def test_cute_dsl_fp8_block_scales(
    @pytest.mark.skip_device_not_contain(["H100"])
    @parametrize_with_ids("mtp_nextn", [0, 2])
    def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        mtp_config = None
        if mtp_nextn > 0:
            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -838,7 +838,7 @@ def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
    @parametrize_with_ids("attention_dp", [False, True])
    def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
                                                       attention_dp):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        mtp_config = None
        if mtp_nextn > 0:
            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -879,7 +879,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
            pytest.skip("https://nvbugs/5252313")
        if torch_compile and pp_size > 1:
            pytest.skip("PP with torch.compile is not supported yet.")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
            enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -979,7 +979,7 @@ def test_cute_dsl_fp8_block_scales_4gpus(
    @pytest.mark.skip_less_device(4)
    @pytest.mark.skip_device_not_contain(["H100", "H200"])
    def test_fp8_block_scales_4gpus_static_eplb(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        num_experts = 72
        num_slots = 80
@@ -1070,7 +1070,7 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
                   torch_compile, mtp_nextn, moe_backend):
        if torch_compile and mtp_nextn > 0:
            pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
            enable_piecewise_cuda_graph=cuda_graph,
@@ -1121,7 +1121,7 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
            pytest.skip("PP with torch.compile is not supported yet.")
        if moe_backend == "TRTLLM" and get_sm_version() == 120:
            pytest.skip("MOE TRTLLM backend does not support SM version 120")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
@@ -1178,7 +1178,7 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv,
        elif quant_dtype == "nvfp4":
            model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
                                        enable_block_reuse=False)
        pytorch_config = dict(
            disable_overlap_scheduler=not overlap_scheduler,

diff --git a/tests/integration/defs/triton_server/test_triton.py b/tests/integration/defs/triton_server/test_triton.py
index c25d82d271b..44b95dddf5f 100644
--- a/tests/integration/defs/triton_server/test_triton.py
+++ b/tests/integration/defs/triton_server/test_triton.py
@@ -508,7 +508,7 @@ def test_cpp_unit_tests(tritonserver_test_root, test_name, llm_root):

    run_shell_command(
        f"cd {llm_root}/triton_backend/inflight_batcher_llm/build && "
-        f"cmake .. -DTRTLLM_DIR={llm_root} -DCMAKE_INSTALL_PREFIX=install/ -DBUILD_TESTS=ON -DUSE_CXX11_ABI=ON "
+        f"cmake .. -DTRTLLM_DIR={llm_root} -DCMAKE_INSTALL_PREFIX=install/ -DBUILD_TESTS=ON -DUSE_CXX11_ABI=ON -DTRITON_COMMON_REPO_TAG=r25.05 -DTRITON_CORE_REPO_TAG=r25.05 -DTRITON_THIRD_PARTY_REPO_TAG=r25.05 -DTRITON_BACKEND_REPO_TAG=r25.05 "
        "&& make -j8 install", llm_root)
    # Run the cpp unit tests

diff --git a/tests/integration/test_lists/test-db/l0_a100.yml b/tests/integration/test_lists/test-db/l0_a100.yml
index d46287d629e..b8a846ccff6 100644
--- a/tests/integration/test_lists/test-db/l0_a100.yml
+++ b/tests/integration/test_lists/test-db/l0_a100.yml
@@ -14,6 +14,7 @@ l0_a100:
      backend: "pytorch"
      tests:
      - unittest/llmapi/test_llm_pytorch.py
+      - unittest/llmapi/test_mpi_session.py
  # generic tests
- condition:
    ranges:
      system_gpu_count:
@@ -27,7 +28,7 @@ l0_a100:
      stage: post_merge
      backend: tensorrt
      tests:
-      - unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others
+      - unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others
      - unittest/llmapi/test_llm_models.py -m "part1"
      - unittest/llmapi/test_llm_models.py -m "not (part0 or part1)"
      - unittest/llmapi/test_llm.py -m "part0"

diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
index bbe1c1b8a27..0aa3e9e5fb8 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -15,5 +15,6 @@ l0_gb200_multi_nodes:
  tests:
  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index cc790ce4eb3..346aab5adf5 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -83,7 +83,7 @@ full:B200_PCIe/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/bindings SKIP (Disable for Blackwell)
-full:B200_PCIe/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
+full:B200_PCIe/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (Disable for Blackwell)
@@ -155,7 +155,7 @@ full:B200/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
full:B200/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200/unittest/bindings SKIP (Disable for Blackwell)
-full:B200/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
+full:B200/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
full:B200/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
full:B200/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (Disable for Blackwell)

diff --git a/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py b/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py
index 6f3a7e6320d..df8214c4a55 100644
--- a/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py
+++ b/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py
@@ -100,7 +100,7 @@ def test_fp8_block_scale_gemm(dtype, m, k, n, inference_mode):
    output_expected = output_expected.to(torch.float)
    diff = calc_diff(output, output_expected)
    assert diff < 1e-3
-    torch.testing.assert_close(output, output_expected, atol=1e-3, rtol=1e-3)
+    torch.testing.assert_close(output, output_expected, atol=1e-2, rtol=1e-2)


@pytest.mark.skipif(

diff --git a/tests/unittest/llmapi/_test_remote_mpi_session.sh b/tests/unittest/llmapi/_test_remote_mpi_session.sh
index 01eff4b2725..792ef70dc85 100644
--- a/tests/unittest/llmapi/_test_remote_mpi_session.sh
+++ b/tests/unittest/llmapi/_test_remote_mpi_session.sh
@@ -7,6 +7,6 @@ echo "Starting remote MPI session test with task: $task"
echo "MPI processes: 2"

# Add timeout to prevent infinite hanging
-timeout 60 mpirun -np 2 trtllm-llmapi-launch python3 _run_mpi_comm_task.py --task_type $task
+timeout 60 mpirun --allow-run-as-root -np 2 trtllm-llmapi-launch python3 _run_mpi_comm_task.py --task_type $task

echo "Remote MPI session test completed"

diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index bda6fdf3fed..ec9bac2c5b6 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -2090,36 +2090,24 @@ def success_path():
    success_path()


-def _test_llm_capture_request_error(pytorch_backend: bool, tp_size: int = 1):
-    llm_args_extra = {}
-    if pytorch_backend:
-        LLM_CLASS = LLM_torch
-        llm_args_extra["max_num_tokens"] = 64
-    else:
-        LLM_CLASS = LLM
-        build_config = BuildConfig()
-        build_config.max_num_tokens = 64
-        llm_args_extra["fast_build"] = True
-        llm_args_extra["build_config"] = build_config
+def _test_llm_capture_request_error(tp_size: int = 1):
+    build_config = BuildConfig()
+    build_config.max_num_tokens = 64

-    llm = LLM_CLASS(
+    llm = LLM(
        model=llama_model_path,
-        tensor_parallel_size=tp_size,
-        **llm_args_extra,
+        build_config=build_config,
+        fast_build=True,
    )

    prompt = 'A ' * 65  # the minimum max_num_tokens is 64
-    if pytorch_backend:
-        # pytorch backend will raise ValueError for max_num_tokens
-        with pytest.raises(ValueError):
-            llm.generate(prompt)
-    else:
-        with pytest.raises(RequestError):
-            llm.generate(prompt)
+
+    with pytest.raises(RequestError):
+        llm.generate(prompt)


def test_llm_capture_request_error():
-    _test_llm_capture_request_error(pytorch_backend=False, tp_size=1)
+    _test_llm_capture_request_error(tp_size=1)


def test_llm_shutdown_executor():

diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py
index 40e657e7894..ecddfbe6a04 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu.py
@@ -466,7 +466,7 @@ def test_llm_get_stats_async_tp2(pytorch_backend):


def test_llm_capture_request_error():
-    _test_llm_capture_request_error(pytorch_backend=False, tp_size=2)
+    _test_llm_capture_request_error(tp_size=2)


def test_llm_with_postprocess_parallel_tp2():

diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
index cb8dbf03c07..38b9e56d086 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
@@ -7,17 +7,11 @@
from tensorrt_llm.lora_manager import LoraConfig
from .lora_test_utils import check_llama_7b_multi_lora_from_request_test_harness
from .test_llm_pytorch import llama_7b_lora_from_dir_test_harness
-from .test_llm import _test_llm_capture_request_error
# isort: on

global_kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)


-@pytest.mark.gpu2
-def test_llm_capture_request_error():
-    _test_llm_capture_request_error(pytorch_backend=True, tp_size=2)
-
-
@pytest.mark.gpu4
def test_tinyllama_logits_processor_tp2pp2():
    tinyllama_logits_processor_test_harness(backend="pytorch",

diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index dd6d2b4be31..486ceb301f5 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -6,11 +6,12 @@

# isort: off
from .lora_test_utils import check_llama_7b_multi_unique_lora_adapters_from_request
-from .test_llm import (
-    get_model_path, global_kvcache_config, llama_model_path,
-    llm_get_stats_async_test_harness, llm_get_stats_test_harness, prompts,
-    run_llm_abort_request, run_llm_with_postprocess_parallel_and_result_handler,
-    tinyllama_logits_processor_test_harness, _test_llm_capture_request_error)
+from .test_llm import (get_model_path, global_kvcache_config, llama_model_path,
+                       llm_get_stats_async_test_harness,
+                       llm_get_stats_test_harness, prompts,
+                       run_llm_abort_request,
+                       run_llm_with_postprocess_parallel_and_result_handler,
+                       tinyllama_logits_processor_test_harness)
from utils.util import (EnvVarsContextManager, force_ampere,
                        run_function_in_sub_process, similar,
                        skip_gpu_memory_less_than_40gb,
@@ -69,10 +70,6 @@ def test_llm_get_stats_async(return_context_logits, use_overlap,
                             enable_iter_req_stats=enable_iter_req_stats)


-def test_llm_capture_request_error():
-    _test_llm_capture_request_error(pytorch_backend=True, tp_size=1)
-
-
@force_ampere
@pytest.mark.parametrize(
    "sampling_params",

diff --git a/tests/unittest/llmapi/test_mpi_session.py b/tests/unittest/llmapi/test_mpi_session.py
index ae8b0eba7a0..484caf7381e 100644
--- a/tests/unittest/llmapi/test_mpi_session.py
+++ b/tests/unittest/llmapi/test_mpi_session.py
@@ -60,13 +60,15 @@ def test_remote_mpi_session(task_type: Literal["submit", "submit_sync"]):
    """Test RemoteMpiPoolSessionClient and RemoteMpiPoolSessionServer interaction"""
    command = ["bash", "_test_remote_mpi_session.sh", task_type]
    print(' '.join(command))
+
    with Popen(command,
               env=os.environ,
               stdout=PIPE,
               stderr=PIPE,
               bufsize=1,
               start_new_session=True,
-               universal_newlines=True) as process:
+               universal_newlines=True,
+               cwd=os.path.dirname(os.path.abspath(__file__))) as process:

        # Function to read from a stream and write to output
        def read_stream(stream, output_stream):

diff --git a/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py b/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
index db3093a5b47..3523dff6819 100644
--- a/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
+++ b/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
@@ -82,7 +82,7 @@ def _parse_log_file(self, filename):

        return json.loads(json_string)

-    def _parse_triton_metrics(self, filename, is_v1):
+    def _parse_triton_metrics(self, filename):
        curl_counts = {}
        with open(filename) as metrics_file:
            for line in metrics_file:
@@ -91,12 +91,11 @@ def _parse_triton_metrics(self, filename, is_v1):
                    metric_output = re.sub(r"^.*?{", "{", line).split()
                    metric_key = metric_output[0]
                    metric_value = metric_output[1]
-                    key = self._convert_metric_key_to_stats_key(
-                        metric_key, is_v1)
+                    key = self._convert_metric_key_to_stats_key(metric_key)
                    curl_counts[key] = metric_value
        return curl_counts

-    def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
+    def _convert_metric_key_to_stats_key(self, metric_output):
        # Converts:
        # '{model="tensorrt_llm",request_type="context",version="1"}'
        # to:
@@ -107,15 +106,12 @@ def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
            if not i.startswith('model') and not i.startswith('version')
        ][0]
        self.assertIn(key, metric_to_stat_dict)
-        if (is_v1):
-            self.assertNotIn("inflight_batcher_specific_metric", key)
-        else:
-            self.assertNotIn("v1_specific_metric", key)
+        self.assertNotIn("v1_specific_metric", key)
        return metric_to_stat_dict[key]

-    def _base_test(self, stats_file, metrics_file, is_v1):
+    def _base_test(self, stats_file, metrics_file):
        stats = self._parse_log_file(stats_file)
-        metrics = self._parse_triton_metrics(metrics_file, is_v1)
+        metrics = self._parse_triton_metrics(metrics_file)
        self.assertEqual(len(stats.keys()), len(metrics.keys()))
        self.assertEqual(list(stats.keys()).sort(), list(metrics.keys()).sort())
        for metric_key in stats.keys():
@@ -140,45 +136,33 @@ def _base_test(self, stats_file, metrics_file, is_v1):
                    timedelta(seconds=-1) <= difference,
                    difference <= timedelta(seconds=1))

-    def test_1_gpu_v1(self):
-        self._base_test("1gpu_v1_no_streaming_server.log",
-                        "1gpu_v1_no_stream_metrics.out", True)
-
    def test_1_gpu_IFB_no_stream(self):
        self._base_test("1gpu_IFB_no_streaming_server.log",
-                        "1gpu_IFB_no_stream_metrics.out", False)
+                        "1gpu_IFB_no_stream_metrics.out")

    def test_1_gpu_IFB_stream(self):
        self._base_test("1gpu_IFB_streaming_server.log",
-                        "1gpu_IFB_stream_metrics.out", False)
+                        "1gpu_IFB_stream_metrics.out")

    if AVAILABLE_GPUS >= 2:

-        def test_2_gpu_v1(self):
-            self._base_test("2gpu_v1_no_streaming_server.log",
-                            "2gpu_v1_no_stream_metrics.out", True)
-
        def test_2_gpu_IFB_no_stream(self):
            self._base_test("2gpu_IFB_no_streaming_server.log",
-                            "2gpu_IFB_no_stream_metrics.out", False)
+                            "2gpu_IFB_no_stream_metrics.out")

        def test_2_gpu_IFB_stream(self):
            self._base_test("2gpu_IFB_streaming_server.log",
-                            "2gpu_IFB_stream_metrics.out", False)
+                            "2gpu_IFB_stream_metrics.out")

    if AVAILABLE_GPUS >= 4:

-        def test_4_gpu_v1(self):
-            self._base_test("4gpu_v1_no_streaming_server.log",
-                            "4gpu_v1_no_stream_metrics.out", True)
-
        def test_4_gpu_IFB_no_stream(self):
            self._base_test("4gpu_IFB_no_streaming_server.log",
-                            "4gpu_IFB_no_stream_metrics.out", False)
+                            "4gpu_IFB_no_stream_metrics.out")

        def test_4_gpu_IFB_stream(self):
            self._base_test("4gpu_IFB_streaming_server.log",
-                            "4gpu_IFB_stream_metrics.out", False)
+                            "4gpu_IFB_stream_metrics.out")


if __name__ == "__main__":

diff --git a/triton_backend/ci/L0_backend_trtllm/test.sh b/triton_backend/ci/L0_backend_trtllm/test.sh
index c09e985a266..83967d1c58c 100644
--- a/triton_backend/ci/L0_backend_trtllm/test.sh
+++ b/triton_backend/ci/L0_backend_trtllm/test.sh
@@ -228,49 +228,13 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-    if [ "$WAIT_RET" != "0" ]; then
-        # Cleanup
-        kill $SERVER_PID > /dev/null 2>&1 || true
-        echo -e "\n***\n*** Failed to start $SERVER\n***"
-        cat $SERVER_LOG
-        exit 1
-    fi
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \
-        --max-input-len=500 \
-        dataset --dataset=${DATASET} \
-        --tokenizer-dir=${TOKENIZER_DIR}
-
-    if [ $? -ne 0 ]; then
-        cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-        RET=1
-    fi
-    set +e
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
-        --max-input-len=500 \
-        --dataset=${DATASET}
-    if [ $? -ne 0 ]; then
+    # Expect invalid GPT model type error to be gracefully handled
+    if [ `grep -c "Static batching type is deprecated" $SERVER_LOG` == "0" ]; then
+        echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***"
        cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-        RET=1
+        exit 1
    fi
-    set +e
-
-    # Make sure the metrics is retrieved after the server has updated the metrics internally
-    sleep ${SLEEP_DURATION}
-    curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out
-
-    kill_server
-    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # inflight batching ON
    # streaming OFF

diff --git a/triton_backend/inflight_batcher_llm/scripts/build.sh b/triton_backend/inflight_batcher_llm/scripts/build.sh
index 8aafc4b0f81..d077746bb51 100644
--- a/triton_backend/inflight_batcher_llm/scripts/build.sh
+++ b/triton_backend/inflight_batcher_llm/scripts/build.sh
@@ -51,7 +51,8 @@ if [[ "$BUILD_UNIT_TESTS" == "true" ]]; then
  BUILD_TESTS_ARG="-DBUILD_TESTS=ON -DUSE_CXX11_ABI=ON"
fi

-cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ${BUILD_TESTS_ARG} ..
+# TODO: Remove specifying Triton version after cmake version is upgraded to 3.31.8
+cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ${BUILD_TESTS_ARG} -DTRITON_COMMON_REPO_TAG=r25.05 -DTRITON_CORE_REPO_TAG=r25.05 -DTRITON_THIRD_PARTY_REPO_TAG=r25.05 -DTRITON_BACKEND_REPO_TAG=r25.05 ..
make install

mkdir -p /opt/tritonserver/backends/tensorrtllm
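The same r25.05 Triton repository tags that this patch pins in `build.sh` and `test_triton.py` can also be passed when configuring the backend build by hand. A minimal sketch, assuming a TensorRT-LLM checkout as the current directory and an existing build directory (the flags mirror the ones added by the patch; paths are illustrative):

```bash
# Sketch only: configure the inflight_batcher_llm build with the pinned r25.05 tags.
cd triton_backend/inflight_batcher_llm/build
cmake .. \
  -DCMAKE_INSTALL_PREFIX=install/ \
  -DBUILD_TESTS=ON \
  -DUSE_CXX11_ABI=ON \
  -DTRITON_COMMON_REPO_TAG=r25.05 \
  -DTRITON_CORE_REPO_TAG=r25.05 \
  -DTRITON_THIRD_PARTY_REPO_TAG=r25.05 \
  -DTRITON_BACKEND_REPO_TAG=r25.05
make -j8 install
```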