diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index 2cf4204d2f6..2b0df0f9066 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -28,101 +28,119 @@ nvidia/Llama-3.1-405B-Instruct-FP4 ``` #### Llama 3.3 70B FP4 + | | GPU | B200 | | | | -|:-----------------------------|:---|:----------|:----------|:----------|:----------| -| | TP Size | 1 | 2 | 4 | 8 | -| ISL, OSL| | | | | | -| | | | | | | -| 128, 128 | | 11,253.28 | 17,867.66 | 24,944.50 | 27,471.49 | -| 128, 2048 | | 9,925.00 | 15,459.71 | 23,608.58 | 30,742.86 | -| 128, 4096 | | 6,318.92 | 8,711.88 | 17,659.74 | 24,947.05 | -| 500, 2000 | | 7,559.88 | 10,602.27 | 20,910.23 | 28,182.34 | -| 1000, 1000 | | 6,866.96 | 10,838.01 | 16,567.86 | 19,991.64 | -| 1000, 2000 | | 6,736.88 | 9,132.08 | 15,737.02 | 20,518.04 | -| 1024, 2048 | | 6,580.56 | 8,767.45 | 15,722.55 | 20,437.96 | -| 2048, 128 | | 1,375.49 | 1,610.69 | 2,707.58 | 3,717.82 | -| 2048, 2048 | | 4,544.73 | 6,956.14 | 12,292.23 | 15,661.22 | -| 5000, 500 | | 1,488.19 | 2,379.73 | 3,588.45 | 4,810.21 | -| 20000, 2000 | | 580.96 | 1,043.58 | 1,957.84 | 3,167.30 | +|:------------------------|:--------|:----------|:----------|:----------|:----------| +| | TP Size | 1 | 2 | 4 | 8 | +| ISL, OSL | | | | | | +| | | | | | | +| 128, 128 | | 10,994.48 | 17,542.11 | 24,667.31 | 27,272.27 | +| 128, 2048 | | 9,580.46 | 15,432.35 | 23,568.12 | 31,174.31 | +| 128, 4096 | | 6,418.39 | 9,841.53 | 17,808.76 | 25,229.25 | +| 500, 2000 | | 7,343.32 | 11,850.57 | 20,709.67 | 28,038.78 | +| 1000, 1000 | | 6,752.53 | 10,815.88 | 16,413.04 | 20,060.66 | +| 1000, 2000 | | 6,670.07 | 9,830.73 | 15,597.49 | 20,672.37 | +| 1024, 2048 | | 6,636.75 | 9,807.13 | 15,519.23 | 20,617.28 | +| 2048, 128 | | 1,342.17 | 1,989.41 | 3,033.14 | 4,035.64 | +| 5000, 500 | | 1,429.67 | 2,419.67 | 3,686.84 | 5,182.96 | +| 20000, 2000 | | 629.77 | 1,177.01 | 2,120.66 | 3,429.03 | #### Llama 3.1 405B FP4 -| | GPU | B200 | -|:-----------------------------|:---|:----------| -| | TP Size | 8 | -| ISL, OSL| | | -| | | | -| 128, 128 | | 9,184.83 | -| 128, 2048 | | 10,387.23 | -| 128, 4096 | | 8,741.80 | -| 500, 2000 | | 9,242.34 | -| 1000, 1000 | | 7,565.50 | -| 1000, 2000 | | 7,696.76 | -| 1024, 2048 | | 7,568.93 | -| 2048, 128 | | 953.57 | -| 2048, 2048 | | 6,092.32 | -| 5000, 500 | | 1,332.22 | -| 20000, 2000 | | 961.58 | + +| | GPU | B200 | | +|:------------------------|:------- |:---------|:----------| +| | TP Size | 4 | 8 | +| ISL, OSL | | | | +| | | | | +| 128, 128 | | 6,163.81 | 9,002.90 | +| 128, 2048 | | 7,081.21 | 10,288.28 | +| 128, 4096 | | 6,028.37 | 8,713.77 | +| 500, 2000 | | 5,858.75 | 9,125.86 | +| 1000, 1000 | | 4,848.00 | 7,582.97 | +| 1000, 2000 | | 5,375.25 | 7,626.28 | +| 1024, 2048 | | 5,345.70 | 7,464.03 | +| 2048, 128 | | 693.55 | 1,086.56 | +| 5000, 500 | | 947.49 | 1,532.45 | +| 20000, 2000 | | 641.11 | 1,097.84 | ### FP8 Models: ``` nvidia/Llama-3.1-8B-Instruct-FP8 -nvidia/Llama-3.1-70B-Instruct-FP8 +nvidia/Llama-3.3-70B-Instruct-FP8 nvidia/Llama-3.1-405B-Instruct-FP8 +nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 ``` #### Llama 3.1 8B FP8 -| | GPU | H200 141GB HBM3 | H100 80GB HBM3 | + +| | GPU | H200 141GB HBM3 | H100 80GB HBM3 | |:-----------------------------|:---|:------------------|:-----------------| -| | TP Size | 1 | 1 | +| | TP Size | 1 | 1 | | ISL, OSL | | | | | | | | | -| 128, 128 | | 28,447.38 | 27,568.68 | -| 128, 2048 | | 23,294.74 | 22,003.62 | -| 128, 4096 | | 17,481.48 | 13,640.35 | -| 
500, 2000 | | 21,462.57 | 17,794.39 | -| 1000, 1000 | | 17,590.60 | 15,270.02 | -| 1000, 2000 | | 17,139.51 | 13,850.22 | -| 1024, 2048 | | 16,970.63 | 13,374.15 | -| 2048, 128 | | 3,531.33 | 3,495.05 | -| 2048, 2048 | | 12,022.38 | 9,653.67 | -| 5000, 500 | | 3,851.65 | 3,371.16 | -| 20000, 2000 | | 1,706.06 | 1,340.92 | - -#### Llama 3.1 70B FP8 -| | GPU | H200 141GB HBM3 | | | | H100 80GB HBM3 | | | | +| 128, 128 | | 27,970.14 | 27,688.36 | +| 128, 2048 | | 23,326.38 | 21,841.15 | +| 128, 4096 | | 17,508.51 | 13,730.89 | +| 500, 2000 | | 21,390.41 | 17,833.34 | +| 1000, 1000 | | 17,366.89 | 15,270.62 | +| 1000, 2000 | | 16,831.31 | 13,798.08 | +| 1024, 2048 | | 16,737.03 | 13,385.50 | +| 2048, 128 | | 3,488.03 | 3,414.67 | +| 5000, 500 | | 3,813.69 | 3,394.54 | +| 20000, 2000 | | 1,696.66 | 1,345.42 | + +#### Llama 3.3 70B FP8 + +| | GPU | H200 141GB HBM3 | | | | H100 80GB HBM3 | | | | |:-----------------------------|:---|:------------------|:---------|:----------|:----------|:-----------------|:---------|:----------|:----------| -| | TP Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 | -| ISL, OSL| | | | | | | | | | +| | TP Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 | +| ISL, OSL | | | | | | | | | | | | | | | | | | | | | -| 128, 128 | | 3,657.58 | 6,477.50 | 10,466.04 | 15,554.57 | 3,191.27 | 6,183.41 | 10,260.68 | 14,686.01 | -| 128, 2048 | | 4,351.07 | 8,450.31 | 13,438.71 | 20,750.58 | 745.19 | 5,822.02 | 11,442.01 | 17,463.99 | -| 128, 4096 | | 2,696.61 | 5,598.92 | 11,524.93 | 16,634.90 | | 3,714.87 | 8,209.91 | 12,598.55 | -| 500, 2000 | | 3,475.58 | 6,712.35 | 12,332.32 | 17,311.28 | | 4,704.31 | 10,278.02 | 14,630.41 | -| 1000, 1000 | | 2,727.42 | 5,097.36 | 8,698.15 | 12,794.92 | 734.67 | 4,191.26 | 7,427.35 | 11,082.48 | -| 1000, 2000 | | 2,913.54 | 5,841.15 | 9,016.49 | 13,174.68 | 526.31 | 3,920.44 | 7,590.35 | 11,108.11 | -| 1024, 2048 | | 2,893.02 | 5,565.28 | 9,017.72 | 13,117.34 | 525.43 | 3,896.14 | 7,557.32 | 11,028.32 | -| 2048, 128 | | 433.30 | 772.97 | 1,278.26 | 1,947.33 | 315.90 | 747.51 | 1,240.12 | 1,840.12 | -| 2048, 2048 | | 1,990.25 | 3,822.83 | 7,068.68 | 10,529.06 | 357.98 | 2,732.86 | 5,640.31 | 8,772.88 | -| 5000, 500 | | 543.88 | 1,005.81 | 1,714.77 | 2,683.22 | 203.27 | 866.77 | 1,571.92 | 2,399.78 | -| 20000, 2000 | | 276.99 | 618.01 | 1,175.35 | 2,021.08 | | 408.43 | 910.77 | 1,568.84 | +| 128, 128 | | 3,605.47 | 6,427.69 | 10,407.42 | 15,434.37 | 3,128.33 | 6,216.91 | | | +| 128, 2048 | | 4,315.80 | 8,464.03 | 13,508.59 | 20,759.72 | 756.42 | 5,782.57 | 11,464.94 | 17,424.32 | +| 128, 4096 | | 2,701.17 | 5,573.55 | 11,458.56 | 16,668.75 | | 3,868.37 | 8,206.39 | 12,624.61 | +| 500, 2000 | | 3,478.76 | 6,740.06 | 12,200.18 | | | 4,684.06 | 9,903.53 | 14,553.93 | +| 1000, 1000 | | 2,744.32 | 5,119.72 | 8,685.44 | 12,744.51 | 742.14 | 4,247.19 | 7,435.65 | 11,018.81 | +| 1000, 2000 | | 2,896.44 | 5,847.26 | 9,031.21 | 13,141.17 | 533.74 | 3,866.53 | 7,611.12 | 11,139.22 | +| 1024, 2048 | | 2,874.18 | 5,568.61 | 8,946.71 | 13,082.62 | 530.16 | 3,796.68 | 7,575.24 | 11,004.31 | +| 2048, 128 | | 435.90 | 772.67 | 1,264.76 | | | 736.89 | 1,213.33 | 1,839.22 | +| 2048, 2048 | | | | | 10,412.85 | | | | | +| 5000, 500 | | 545.96 | 997.15 | 1,698.22 | 2,655.28 | 204.94 | 862.91 | 1,552.68 | 2,369.84 | +| 20000, 2000 | | 276.66 | 620.33 | 1,161.29 | 1,985.85 | | 416.13 | 903.66 | 1,554.10 | #### Llama 3.1 405B FP8 -| | GPU | H200 141GB HBM3 | H100 80GB HBM3 | + +| | GPU | H200 141GB HBM3 | H100 80GB HBM3 | 
|:-----------------------------|:---|:------------------|:-----------------| -| | TP Size | 8 | 8 | +| | TP Size | 8 | 8 | | ISL, OSL | | | | | | | | | -| 128, 128 | | 3,800.11 | 3,732.40 | -| 128, 2048 | | 5,661.13 | 4,572.23 | -| 128, 4096 | | 5,167.18 | 2,911.42 | -| 500, 2000 | | 4,854.29 | 3,661.85 | -| 1000, 1000 | | 3,332.15 | 2,963.36 | -| 1000, 2000 | | 3,682.15 | 3,253.17 | -| 1024, 2048 | | 3,685.56 | 3,089.16 | -| 2048, 128 | | 453.42 | 448.89 | -| 2048, 2048 | | 3,055.73 | 2,139.94 | -| 5000, 500 | | 656.11 | 579.14 | -| 20000, 2000 | | 514.02 | 370.26 | +| 128, 2048 | | 5,567.87 | | +| 128, 4096 | | 5,136.85 | | +| 500, 2000 | | 4,787.61 | 3,673.91 | +| 1000, 1000 | | 3,286.30 | 3,012.22 | +| 1000, 2000 | | 3,636.76 | 3,262.20 | +| 1024, 2048 | | 3,618.66 | 3,109.70 | +| 2048, 128 | | 443.10 | 449.02 | +| 5000, 500 | | 645.46 | | +| 20000, 2000 | | | 372.12 | + +#### Llama 4 Maverick FP8 + +| | GPU | H200 141GB HBM3 | H100 80GB HBM3 | +|:-----------------------------|:---|:------------------|:-----------------| +| | TP Size | 8 | 8 | +| ISL, OSL | | | | +| | | | | +| 128, 2048 | | 27,543.87 | | +| 128, 4096 | | 18,541.01 | 11,163.12 | +| 500, 2000 | | 21,117.34 | | +| 1000, 2000 | | | 10,556.00 | +| 1024, 2048 | | 16,859.45 | 11,584.33 | +| 2048, 128 | | 4,364.06 | 3,832.38 | +| 2048, 2048 | | 12,800.89 | | +| 5000, 500 | | 5,128.60 | | +| 20000, 2000 | | 1,764.27 | 1,400.79 | ## Reproducing Benchmarked Results @@ -198,6 +216,8 @@ a model name (HuggingFace reference or path to a local model), a [generated data trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options ``` +The data collected for the v0.20 benchmarks was run with the following file: + `llm_options.yml` ```yaml use_cuda_graph: true @@ -220,7 +240,7 @@ cuda_graph_batch_sizes: - 8192 ``` -In majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out of memory issue. +In a majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out of memory issue. The results will be printed to the terminal upon benchmark completion. For example, diff --git a/docs/source/quick-start-guide.md b/docs/source/quick-start-guide.md index d7acb4a0be6..53519e61047 100644 --- a/docs/source/quick-start-guide.md +++ b/docs/source/quick-start-guide.md @@ -14,6 +14,9 @@ There are multiple ways to install and run TensorRT-LLM. For most users, the opt 1. [Building from source](installation/build-from-source-linux) +The following examples can most easily be executed using the prebuilt [Docker release container available on NGC](https://registry.ngc.nvidia.com/orgs/nvstaging/teams/tensorrt-llm/containers/release) (see also [release.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/release.md) on GitHub). Ensure to run these commands as a user with appropriate permissions, preferably `root`, to streamline the setup process. + + ## LLM API The LLM API is a Python API designed to facilitate setup and inference with TensorRT-LLM directly within Python. It enables model optimization by simply specifying a HuggingFace repository name or a model checkpoint. 
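As a rough illustration of that flow, here is a minimal sketch in the spirit of the LLM API quick-start example; the model name and sampling settings below are placeholders chosen for illustration, not a required configuration:

```python
from tensorrt_llm import LLM, SamplingParams

# Any HuggingFace model ID or local checkpoint path can be used here;
# TinyLlama is only a small, illustrative example model.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Illustrative sampling settings; tune them for your workload.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

outputs = llm.generate(["What is the capital of France?"], sampling_params)
for output in outputs:
    # Each result carries the generated completions; print the first one.
    print(output.outputs[0].text)
```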
Under the hood, the LLM API streamlines the process by managing checkpoint conversion, engine building, engine loading, and model inference, all through a single Python object.
@@ -89,7 +92,7 @@ For detailed examples and command syntax, refer to the [trtllm-serve](commands/t

 2. Open a new terminal and use the following command to directly attach to the running container:

-```bash
+```bash
 docker exec -it  bash
 ```
diff --git a/docs/source/quick-start-guide.md b/docs/source/quick-start-guide.md
diff --git a/docs/source/release-notes.md b/docs/source/release-notes.md
index bb663aba7d2..d5c239b82e4 100644
--- a/docs/source/release-notes.md
+++ b/docs/source/release-notes.md
@@ -4,6 +4,82 @@

 All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).

+## TensorRT-LLM Release 0.20.0
+
+### Key Features and Enhancements
+- **Model Support**
+  - Added Qwen3 support. Refer to the “Qwen3” section in `examples/models/core/qwen/README.md`.
+  - Added HyperCLOVAX-SEED-Vision support in the PyTorch flow. Refer to `examples/models/contrib/hyperclovax/README.md`.
+  - Added Dynasor-CoT in scaffolding examples. Refer to `examples/scaffolding/contrib/Dynasor/README.md`.
+  - Added Mistral Small 3.1 24B VLM support in the TRT workflow
+  - Added Gemma3-1b-it support in the PyTorch workflow
+  - Added Nemotron-H model support
+  - Added Eagle-3 support for LLAMA4
+- **PyTorch workflow**
+  - Added LoRA support
+  - Added return logits support
+  - Adopted the new logprob definition in the PyTorch flow
+  - Enabled per-request stats with the PyTorch backend
+  - Enabled LogitsProcessor in the PyTorch backend
+- **Benchmark**
+  - Added beam width to the low-latency benchmark
+  - Fixed trtllm-bench `iter_stats` and `cuda_graph_batch_sizes` errors
+  - Removed the deprecated Python runtime benchmark
+  - Added benchmark support for scaffolding
+- **Multimodal models**
+  - Added support in trtllm-serve
+  - Added support in trtllm-bench; currently limited to image inputs only
+- Supported DeepSeek-R1 W4A8 on Hopper
+- Added RTX Pro 6000 support on a single GPU
+- Integrated the Llama4 input processor
+- Added CGA reduction FMHA kernels on Blackwell
+- Enabled chunked context for FlashInfer
+- Supported KV cache reuse for MLA
+- Added Piecewise CUDA Graph support
+- Supported multiple LoRA adapters and TP
+- Added a KV cache-aware router for disaggregated serving
+- Added unfused attention for native support
+- Added a group_rms_norm kernel to normalize multiple inputs in a single operator
+- Added a smart router for the MoE module
+- Added head size 72 support for the QKV preprocessing kernel
+- Added MNNVL MoE A2A support
+- Optimized large embedding tables in multimodal models
+- Supported top-K logprobs and prompt_logprobs in the LLM API
+- Enabled the overlap scheduler in the TRT workflow via the executor API
+
+### Infrastructure Changes
+- **The TRT-LLM team now formally releases a Docker image on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags)**.
+- The pre-built TensorRT-LLM wheel on PyPI is now linked against PyTorch 2.7.0, which uses the CXX11 ABI
+- The dependent TensorRT version is updated to 10.10.0
+- The dependent CUDA version is updated to 12.9.0
+- The dependent public PyTorch version is updated to 2.7.0
+- The dependent NVIDIA ModelOpt version is updated to 0.29.0
+- The dependent NCCL version is maintained at 2.25.1
+- Open-sourced XQA kernels
+- The dependent `datasets` version is upgraded to 3.1.0
+- Migrated the Triton backend into the TensorRT-LLM repo as a TensorRT-LLM submodule
+- Downgraded the GCC toolset version from 13 to 11
+
+### API Changes
+- [Breaking Change]: Enabled scheduling overlap by default
+- Removed the deprecated GptSession/V1 from the TRT workflow
+- Set `_AutoDeployLlmArgs` as the primary config object
+- Allowed overriding CLI arguments with a YAML file in trtllm-serve
+- Introduced a multimodal embedding field in `LlmRequest`
+
+### Fixed Issues
+- Fix hang bug when context server doesn't have enough capacity for KV Cache (#3095)
+- Fix C++ decoder synchronization in PyTorch (#3106)
+- Fix bug related to creating CUDA stream as default parameter, which will be initialized during importing (#3764)
+- Fix attention DP bug on Qwen3 MoE model (#4141)
+- Fix illegal memory access when running LLaMA 4 with CUDA Graph enabled (#4101)
+- Reset planned states to avoid memory leak in TrtllmAttentionWrapper (#4227)
+
+### Known Issues
+- Multi-GPU model support on RTX Pro 6000
+
 ## TensorRT-LLM Release 0.19.0


diff --git a/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py b/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
index db3093a5b47..3523dff6819 100644
--- a/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
+++ b/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
@@ -82,7 +82,7 @@ def _parse_log_file(self, filename):
 
         return json.loads(json_string)
 
-    def _parse_triton_metrics(self, filename, is_v1):
+    def _parse_triton_metrics(self, filename):
         curl_counts = {}
         with open(filename) as metrics_file:
             for line in metrics_file:
@@ -91,12 +91,11 @@ def _parse_triton_metrics(self, filename, is_v1):
                 metric_output = re.sub(r"^.*?{", "{", line).split()
                 metric_key = metric_output[0]
                 metric_value = metric_output[1]
-                key = self._convert_metric_key_to_stats_key(
-                    metric_key, is_v1)
+                key = self._convert_metric_key_to_stats_key(metric_key)
                 curl_counts[key] = metric_value
         return curl_counts
 
-    def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
+    def _convert_metric_key_to_stats_key(self, metric_output):
         # Converts:
         # '{model="tensorrt_llm",request_type="context",version="1"}'
         # to:
@@ -107,15 +106,12 @@ def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
             if not i.startswith('model') and not i.startswith('version')
         ][0]
         self.assertIn(key, metric_to_stat_dict)
-        if (is_v1):
-            self.assertNotIn("inflight_batcher_specific_metric", key)
-        else:
-            self.assertNotIn("v1_specific_metric", key)
+        self.assertNotIn("v1_specific_metric", key)
         return metric_to_stat_dict[key]
 
-    def _base_test(self, stats_file, metrics_file, is_v1):
+    def _base_test(self, stats_file, metrics_file):
         stats = self._parse_log_file(stats_file)
-        metrics = self._parse_triton_metrics(metrics_file, is_v1)
+        metrics = self._parse_triton_metrics(metrics_file)
         self.assertEqual(len(stats.keys()), len(metrics.keys()))
         self.assertEqual(list(stats.keys()).sort(),
list(metrics.keys()).sort()) for metric_key in stats.keys(): @@ -140,45 +136,33 @@ def _base_test(self, stats_file, metrics_file, is_v1): timedelta(seconds=-1) <= difference, difference <= timedelta(seconds=1)) - def test_1_gpu_v1(self): - self._base_test("1gpu_v1_no_streaming_server.log", - "1gpu_v1_no_stream_metrics.out", True) - def test_1_gpu_IFB_no_stream(self): self._base_test("1gpu_IFB_no_streaming_server.log", - "1gpu_IFB_no_stream_metrics.out", False) + "1gpu_IFB_no_stream_metrics.out") def test_1_gpu_IFB_stream(self): self._base_test("1gpu_IFB_streaming_server.log", - "1gpu_IFB_stream_metrics.out", False) + "1gpu_IFB_stream_metrics.out") if AVAILABLE_GPUS >= 2: - def test_2_gpu_v1(self): - self._base_test("2gpu_v1_no_streaming_server.log", - "2gpu_v1_no_stream_metrics.out", True) - def test_2_gpu_IFB_no_stream(self): self._base_test("2gpu_IFB_no_streaming_server.log", - "2gpu_IFB_no_stream_metrics.out", False) + "2gpu_IFB_no_stream_metrics.out") def test_2_gpu_IFB_stream(self): self._base_test("2gpu_IFB_streaming_server.log", - "2gpu_IFB_stream_metrics.out", False) + "2gpu_IFB_stream_metrics.out") if AVAILABLE_GPUS >= 4: - def test_4_gpu_v1(self): - self._base_test("4gpu_v1_no_streaming_server.log", - "4gpu_v1_no_stream_metrics.out", True) - def test_4_gpu_IFB_no_stream(self): self._base_test("4gpu_IFB_no_streaming_server.log", - "4gpu_IFB_no_stream_metrics.out", False) + "4gpu_IFB_no_stream_metrics.out") def test_4_gpu_IFB_stream(self): self._base_test("4gpu_IFB_streaming_server.log", - "4gpu_IFB_stream_metrics.out", False) + "4gpu_IFB_stream_metrics.out") if __name__ == "__main__": diff --git a/triton_backend/ci/L0_backend_trtllm/test.sh b/triton_backend/ci/L0_backend_trtllm/test.sh index c09e985a266..83967d1c58c 100644 --- a/triton_backend/ci/L0_backend_trtllm/test.sh +++ b/triton_backend/ci/L0_backend_trtllm/test.sh @@ -228,49 +228,13 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do run_server "${SERVER_ARGS}" wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - set -e - python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \ - --max-input-len=500 \ - dataset --dataset=${DATASET} \ - --tokenizer-dir=${TOKENIZER_DIR} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]} - RET=1 - fi - set +e - - set -e - python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - if [ $? -ne 0 ]; then + # Expect invalid GPT model type error to be gracefully handled + if [ `grep -c "Static batching type is deprecated" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***" cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]} - RET=1 + exit 1 fi - set +e - - # Make sure the metrics is retrieved after the server has updated the metrics internally - sleep ${SLEEP_DURATION} - curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out - - kill_server - wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]} # inflight batching ON # streaming OFF