Skip to content

Commit 92493ef

Update TensorRT-LLM backend (#322)
1 parent 4344654 commit 92493ef

7 files changed: +104 -328 lines changed


README.md

Lines changed: 31 additions & 17 deletions
@@ -60,35 +60,49 @@ Starting with Triton 23.10 release, you can follow steps described in the
 [Building With Docker](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/build.md#building-with-docker)
 guide and use the
 [build.py](https://github.com/triton-inference-server/server/blob/main/build.py)
-script.
+script to build the TRT-LLM backend.
 
-A sample command to build a Triton Server container with all options enabled is
-shown below, which will build the same TRT-LLM container as the one on the NGC.
+The below commands will build the same Triton TRT-LLM container as the one on the NGC.
 
 ```bash
-BASE_CONTAINER_IMAGE_NAME=nvcr.io/nvidia/tritonserver:23.10-py3-min
-TENSORRTLLM_BACKEND_REPO_TAG=release/0.5.0
-PYTHON_BACKEND_REPO_TAG=r23.10
-
-# Run the build script. The flags for some features or endpoints can be removed if not needed.
+# Prepare the TRT-LLM base image using the dockerfile from tensorrtllm_backend.
+cd tensorrtllm_backend
+# Specify the build args for the dockerfile.
+BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.01-py3-min
+TRT_VERSION=9.2.0.5
+TRT_URL_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.2.0/tensorrt-9.2.0.5.linux.x86_64-gnu.cuda-12.2.tar.gz
+TRT_URL_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.2.0/tensorrt-9.2.0.5.Ubuntu-22.04.aarch64-gnu.cuda-12.2.tar.gz
+
+docker build -t trtllm_base \
+             --build-arg BASE_IMAGE="${BASE_IMAGE}" \
+             --build-arg TRT_VER="${TRT_VERSION}" \
+             --build-arg RELEASE_URL_TRT_x86="${TRT_URL_x86}" \
+             --build-arg RELEASE_URL_TRT_ARM="${TRT_URL_ARM}" \
+             -f dockerfile/Dockerfile.triton.trt_llm_backend .
+
+# Run the build script from Triton Server repo. The flags for some features or
+# endpoints can be removed if not needed. Please refer to the support matrix to
+# see the aligned versions: https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+TRTLLM_BASE_IMAGE=trtllm_base
+TENSORRTLLM_BACKEND_REPO_TAG=v0.7.2
+PYTHON_BACKEND_REPO_TAG=r24.01
+
+cd server
 ./build.py -v --no-container-interactive --enable-logging --enable-stats --enable-tracing \
     --enable-metrics --enable-gpu-metrics --enable-cpu-metrics \
     --filesystem=gcs --filesystem=s3 --filesystem=azure_storage \
     --endpoint=http --endpoint=grpc --endpoint=sagemaker --endpoint=vertex-ai \
     --backend=ensemble --enable-gpu --endpoint=http --endpoint=grpc \
-    --image=base,${BASE_CONTAINER_IMAGE_NAME} \
+    --image=base,${TRTLLM_BASE_IMAGE} \
     --backend=tensorrtllm:${TENSORRTLLM_BACKEND_REPO_TAG} \
     --backend=python:${PYTHON_BACKEND_REPO_TAG}
 ```
 
-The `BASE_CONTAINER_IMAGE_NAME` is the base image that will be used to build the
-container. By default it is set to the most recent min image of Triton, on NGC,
-that matches the Triton release you are building for. You can change it to a
-different image if needed by setting the `--image` flag like the command below.
-The `TENSORRTLLM_BACKEND_REPO_TAG` and `PYTHON_BACKEND_REPO_TAG` are the tags of
-the TensorRT-LLM backend and Python backend repositories that will be used
-to build the container. You can also remove the features or endpoints that you
-don't need by removing the corresponding flags.
+The `TRTLLM_BASE_IMAGE` is the base image that will be used to build the
+container. The `TENSORRTLLM_BACKEND_REPO_TAG` and `PYTHON_BACKEND_REPO_TAG` are
+the tags of the TensorRT-LLM backend and Python backend repositories that will
+be used to build the container. You can also remove the features or endpoints
+that you don't need by removing the corresponding flags.
 
 ### Option 3. Build via Docker
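Note: after the two-stage build described in the updated README, a quick sanity check of the resulting image can be useful. The sketch below is not part of this commit; the `tritonserver` image tag produced by build.py and the `/opt/tritonserver/backends/tensorrtllm` path are assumed defaults of the Triton build, not taken from the diff.

```bash
# Hedged sanity check (not from this commit): confirm the TRT-LLM backend
# directory exists in the image produced by build.py. The "tritonserver" tag
# and the backends path are assumptions based on Triton's default layout.
docker run --rm tritonserver:latest \
    ls /opt/tritonserver/backends/tensorrtllm
```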

ci/L0_backend_trtllm/test.sh

Lines changed: 4 additions & 0 deletions
@@ -39,6 +39,10 @@ CUSTOM_METRICS_VERIFICATION_TEST=custom_metrics_verification_tests.py
 CUSTOM_METRICS_VERIFICATION_LOG="custom_metrics_verification.log"
 SERVER_PID=0
 
+# Force environment to use python version 3
+apt update -q=2 \
+    && apt install -y python-is-python3
+
 # Helpers ===============================
 function replace_config_tags {
   tag_to_replace="${1}"
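For context on the lines added above: `python-is-python3` only makes the unversioned `python` command resolve to `python3`. A minimal illustration (not from this commit) of what the test environment looks like after that install:

```bash
# Illustration only: after "apt install python-is-python3", scripts that call
# the unversioned "python" keep working on images that ship only python3.
command -v python                     # -> /usr/bin/python
readlink -f "$(command -v python)"    # -> /usr/bin/python3.x (exact version varies)
python --version                      # e.g. Python 3.10.x
```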
dockerfile/Dockerfile.triton.trt_llm_backend

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+ARG BASE_IMAGE
+
+FROM ${BASE_IMAGE} as base
+
+RUN apt-get update -q=2 && apt-get install -y --no-install-recommends python3-pip
+# Remove previous TRT installation
+# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries.
+RUN apt-get remove -y tensorrt*
+RUN pip3 uninstall -y tensorrt
+
+ARG TRT_VER
+
+ENV TRT_VERSION=$TRT_VER \
+    TRT_VER=$TRT_VER \
+    CUDA_VER=$CUDA_VERSION \
+    CUDNN_VER=$CUDNN_VERSION \
+    NCCL_VER=$NCCL_VERSION \
+    CUBLAS_VER=$CUBLAS_VERSION
+
+LABEL TRT_VERSION $TRT_VER
+
+# Download & install internal TRT release
+RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" \
+    && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb \
+    && apt install /tmp/cuda-keyring.deb \
+    && rm /tmp/cuda-keyring.deb \
+    && apt-get update -q=2
+
+ARG RELEASE_URL_TRT_x86
+ARG RELEASE_URL_TRT_ARM
+
+RUN [ "$(uname -m)" != "x86_64" ] && RELEASE_URL_TRT=${RELEASE_URL_TRT_ARM} || RELEASE_URL_TRT=${RELEASE_URL_TRT_x86} \
+    && curl -fSL -o /tmp/tensorrt.tar.gz ${RELEASE_URL_TRT} \
+    && tar xzvf /tmp/tensorrt.tar.gz -C /usr/local \
+    && rm /tmp/tensorrt.tar.gz \
+    && find /usr/local -maxdepth 1 -name Tens* -type d -exec ln -s {} /usr/local/tensorrt \;
+
+RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))" )*
+
+ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH}
+ENV TRT_ROOT=/usr/local/tensorrt
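A quick way to confirm what this new dockerfile produces is to run the TensorRT import inside the built base image. This is a hedged sketch, not part of the commit; it assumes the `trtllm_base` tag used in the README build command above.

```bash
# Hedged check (not from this commit): verify the tarball-installed TensorRT
# and the /usr/local/tensorrt symlink inside the freshly built base image.
docker run --rm trtllm_base bash -c '
    ls -l /usr/local/tensorrt &&
    python3 -c "import tensorrt; print(tensorrt.__version__)"'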

inflight_batcher_llm/CMakeLists.txt

Lines changed: 27 additions & 60 deletions
@@ -28,20 +28,7 @@ set(TRITON_BUILD
     OFF
     CACHE STRING "Using Triton build process")
 
-if(TRITON_BUILD)
-  set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm)
-  # Install build time dependencies. This section is executed during cmake
-  # configure time.
-  execute_process(
-    COMMAND bash -x ./tools/environment_setup.sh
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-    RESULT_VARIABLE CMD_RESULT)
-  if(NOT CMD_RESULT EQUAL "0")
-    message(FATAL_ERROR "Failed to install build time dependencies")
-  endif()
-else()
-  set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm)
-endif()
+set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm)
 
 include(${TRTLLM_DIR}/cpp/cmake/modules/find_library_create_target.cmake)
 
@@ -64,16 +51,6 @@ if(TRITON_ENABLE_METRICS AND NOT TRITON_ENABLE_STATS)
     FATAL_ERROR "TRITON_ENABLE_METRICS=ON requires TRITON_ENABLE_STATS=ON")
 endif()
 
-# The TRTLLM_BUILD_CONTAINER is used to compile the TRT-LLM libraries that are
-# needed for the TRT-LLM backend. The TRTLLM_BUILD_CONTAINER is launched
-# separately, and the artifacts will be copied back to the backend installation
-# directory.
-if(TRITON_BUILD)
-  set(TRTLLM_BUILD_CONTAINER
-      ""
-      CACHE STRING "Base image for building TRT-LLM")
-endif()
-
 set(TRITON_COMMON_REPO_TAG
     "main"
     CACHE STRING "Tag for triton-inference-server/common repo")
@@ -116,31 +93,6 @@ FetchContent_Declare(
   GIT_SHALLOW ON)
 FetchContent_MakeAvailable(repo-common repo-core repo-backend)
 
-# Compile TRT-LLM
-if(TRITON_BUILD)
-  set(TRITON_TRTLLM_DOCKER_NAME "tritonserver-trtllm")
-  add_custom_command(
-    OUTPUT tensorrt_llm_build
-    COMMENT "Building TensorRT-LLM"
-    COMMAND
-      cd ${CMAKE_CURRENT_SOURCE_DIR} && python3 tools/gen_trtllm_dockerfile.py
-      --trtllm-build-config="${CMAKE_BUILD_TYPE}"
-      --trtllm-base-image="${TRTLLM_BUILD_CONTAINER}" --output=Dockerfile.trtllm
-    COMMAND
-      cd ${CMAKE_CURRENT_SOURCE_DIR} && docker build
-      --cache-from=${TRITON_TRTLLM_DOCKER_NAME}
-      --cache-from=${TRITON_TRTLLM_DOCKER_NAME}_cache0
-      --cache-from=${TRITON_TRTLLM_DOCKER_NAME}_cache1 -t
-      ${TRITON_TRTLLM_DOCKER_NAME} -f ./Dockerfile.trtllm .
-    COMMAND docker rm trtllm_build || echo 'error ignored...' || true
-    COMMAND docker create --name trtllm_build ${TRITON_TRTLLM_DOCKER_NAME}
-    COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && rm -fr tensorrt_llm
-    COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && docker cp
-      trtllm_build:/app/tensorrt_llm tensorrt_llm
-    COMMAND docker cp trtllm_build:/opt/trtllm_lib trtllm_build
-    COMMAND docker rm trtllm_build)
-endif()
-
 #
 # The backend must be built into a shared library. Use an ldscript to hide all
 # symbols except for the TRITONBACKEND API.
@@ -153,11 +105,6 @@ set(SRCS src/libtensorrtllm.cc src/work_item.cc src/work_items_queue.cc
 
 add_library(triton-tensorrt-llm-backend SHARED ${SRCS})
 
-if(TRITON_BUILD)
-  add_custom_target(trtllm_target DEPENDS tensorrt_llm_build)
-  add_dependencies(triton-tensorrt-llm-backend trtllm_target)
-endif()
-
 add_library(TritonTensorRTLLMBackend::triton-tensorrt-llm-backend ALIAS
             triton-tensorrt-llm-backend)
 
@@ -352,10 +299,25 @@ if(TRITON_ENABLE_METRICS)
 endif()
 
 if(TRITON_BUILD)
-  add_dependencies(tensorrt_llm trtllm_target)
-  add_dependencies(tensorrt_llm_batch_manager trtllm_target)
-  add_dependencies(nvinfer_plugin_tensorrt_llm trtllm_target)
-endif()
+
+  if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")
+    execute_process(
+      WORKING_DIRECTORY ${TRTLLM_DIR}
+      COMMAND bash -x docker/common/install_pytorch.sh pypi COMMAND_ECHO STDOUT
+              COMMAND_ERROR_IS_FATAL ANY)
+  else()
+    execute_process(
+      WORKING_DIRECTORY ${TRTLLM_DIR}
+      COMMAND bash -x docker/common/install_pytorch.sh src_non_cxx11_abi
+              COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)
+  endif() # CMAKE_HOST_SYSTEM_PROCESSOR
+
+  execute_process(
+    WORKING_DIRECTORY ${TRTLLM_DIR}
+    COMMAND python3 scripts/build_wheel.py --trt_root /usr/local/tensorrt
+    COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)
+
+endif() # TRITON_BUILD
 
 target_link_libraries(
   triton-tensorrt-llm-backend
@@ -407,9 +369,14 @@ install(
   RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm)
 
 if(TRITON_BUILD)
-  install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/trtllm_build/
+  file(
+    GLOB
+    LIBINFER_PLUGIN_TENSORRT_LLM
+    "${TRTLLM_DIR}/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so*"
+    FOLLOW_SYMLINKS)
+  install(FILES ${LIBINFER_PLUGIN_TENSORRT_LLM}
           DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm)
-endif()
+endif() # TRITON_BUILD
 
 install(
   EXPORT triton-tensorrt-llm-backend-targets
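The CMake changes above replace the separate docker-based TRT-LLM compilation with configure-time steps: when `TRITON_BUILD=ON`, cmake now runs `docker/common/install_pytorch.sh` and `scripts/build_wheel.py --trt_root /usr/local/tensorrt` inside `TRTLLM_DIR`, then installs the resulting `libnvinfer_plugin_tensorrt_llm.so` next to the backend. A rough manual equivalent is sketched below; the script paths come from the diff, while the cmake and make invocation details are assumptions, not part of this commit.

```bash
# Hedged sketch (not from this commit) of the configure-time work now done
# when TRITON_BUILD=ON, written out as manual steps. Assumes the tensorrt_llm
# checkout sits one level above inflight_batcher_llm and TensorRT lives at
# /usr/local/tensorrt, matching the new base image.
cd tensorrtllm_backend/tensorrt_llm
bash docker/common/install_pytorch.sh pypi                     # x86_64 branch of the new CMake logic
python3 scripts/build_wheel.py --trt_root /usr/local/tensorrt  # builds the TRT-LLM libraries and wheel

cd ../inflight_batcher_llm
mkdir -p build && cd build
cmake -DTRITON_BUILD=OFF ..     # assumed standalone configure; TRTLLM_DIR defaults to ../tensorrt_llm
make -j"$(nproc)" install
```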

tools/environment_setup.sh

Lines changed: 0 additions & 53 deletions
This file was deleted.
