Skip to content

Commit 92493ef

Update TensorRT-LLM backend (#322)
1 parent 4344654 commit 92493ef

7 files changed: +104 -328 lines changed


README.md

Lines changed: 31 additions & 17 deletions
@@ -60,35 +60,49 @@ Starting with Triton 23.10 release, you can follow steps described in the
 [Building With Docker](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/build.md#building-with-docker)
 guide and use the
 [build.py](https://github.com/triton-inference-server/server/blob/main/build.py)
-script.
+script to build the TRT-LLM backend.
 
-A sample command to build a Triton Server container with all options enabled is
-shown below, which will build the same TRT-LLM container as the one on the NGC.
+The below commands will build the same Triton TRT-LLM container as the one on the NGC.
 
 ```bash
-BASE_CONTAINER_IMAGE_NAME=nvcr.io/nvidia/tritonserver:23.10-py3-min
-TENSORRTLLM_BACKEND_REPO_TAG=release/0.5.0
-PYTHON_BACKEND_REPO_TAG=r23.10
-
-# Run the build script. The flags for some features or endpoints can be removed if not needed.
+# Prepare the TRT-LLM base image using the dockerfile from tensorrtllm_backend.
+cd tensorrtllm_backend
+# Specify the build args for the dockerfile.
+BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.01-py3-min
+TRT_VERSION=9.2.0.5
+TRT_URL_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.2.0/tensorrt-9.2.0.5.linux.x86_64-gnu.cuda-12.2.tar.gz
+TRT_URL_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.2.0/tensorrt-9.2.0.5.Ubuntu-22.04.aarch64-gnu.cuda-12.2.tar.gz
+
+docker build -t trtllm_base \
+             --build-arg BASE_IMAGE="${BASE_IMAGE}" \
+             --build-arg TRT_VER="${TRT_VERSION}" \
+             --build-arg RELEASE_URL_TRT_x86="${TRT_URL_x86}" \
+             --build-arg RELEASE_URL_TRT_ARM="${TRT_URL_ARM}" \
+             -f dockerfile/Dockerfile.triton.trt_llm_backend .
+
+# Run the build script from Triton Server repo. The flags for some features or
+# endpoints can be removed if not needed. Please refer to the support matrix to
+# see the aligned versions: https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+TRTLLM_BASE_IMAGE=trtllm_base
+TENSORRTLLM_BACKEND_REPO_TAG=v0.7.2
+PYTHON_BACKEND_REPO_TAG=r24.01
+
+cd server
 ./build.py -v --no-container-interactive --enable-logging --enable-stats --enable-tracing \
     --enable-metrics --enable-gpu-metrics --enable-cpu-metrics \
     --filesystem=gcs --filesystem=s3 --filesystem=azure_storage \
     --endpoint=http --endpoint=grpc --endpoint=sagemaker --endpoint=vertex-ai \
     --backend=ensemble --enable-gpu --endpoint=http --endpoint=grpc \
-    --image=base,${BASE_CONTAINER_IMAGE_NAME} \
+    --image=base,${TRTLLM_BASE_IMAGE} \
     --backend=tensorrtllm:${TENSORRTLLM_BACKEND_REPO_TAG} \
     --backend=python:${PYTHON_BACKEND_REPO_TAG}
 ```
 
-The `BASE_CONTAINER_IMAGE_NAME` is the base image that will be used to build the
-container. By default it is set to the most recent min image of Triton, on NGC,
-that matches the Triton release you are building for. You can change it to a
-different image if needed by setting the `--image` flag like the command below.
-The `TENSORRTLLM_BACKEND_REPO_TAG` and `PYTHON_BACKEND_REPO_TAG` are the tags of
-the TensorRT-LLM backend and Python backend repositories that will be used
-to build the container. You can also remove the features or endpoints that you
-don't need by removing the corresponding flags.
+The `TRTLLM_BASE_IMAGE` is the base image that will be used to build the
+container. The `TENSORRTLLM_BACKEND_REPO_TAG` and `PYTHON_BACKEND_REPO_TAG` are
+the tags of the TensorRT-LLM backend and Python backend repositories that will
+be used to build the container. You can also remove the features or endpoints
+that you don't need by removing the corresponding flags.
 
 ### Option 3. Build via Docker
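Note: after the two-stage build described in the updated README, a quick sanity check of the resulting image can be useful. The sketch below is not part of this commit; the `tritonserver` image tag produced by build.py and the `/opt/tritonserver/backends/tensorrtllm` path are assumed defaults of the Triton build, not taken from the diff.

```bash
# Hedged sanity check (not from this commit): confirm the TRT-LLM backend
# directory exists in the image produced by build.py. The "tritonserver" tag
# and the backends path are assumptions based on Triton's default layout.
docker run --rm tritonserver:latest \
    ls /opt/tritonserver/backends/tensorrtllm
```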

ci/L0_backend_trtllm/test.sh

Lines changed: 4 additions & 0 deletions
@@ -39,6 +39,10 @@ CUSTOM_METRICS_VERIFICATION_TEST=custom_metrics_verification_tests.py
 CUSTOM_METRICS_VERIFICATION_LOG="custom_metrics_verification.log"
 SERVER_PID=0
 
+# Force environment to use python version 3
+apt update -q=2 \
+    && apt install -y python-is-python3
+
 # Helpers ===============================
 function replace_config_tags {
   tag_to_replace="${1}"
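For context on the lines added above: `python-is-python3` only makes the unversioned `python` command resolve to `python3`. A minimal illustration (not from this commit) of what the test environment looks like after that install:

```bash
# Illustration only: after "apt install python-is-python3", scripts that call
# the unversioned "python" keep working on images that ship only python3.
command -v python                     # -> /usr/bin/python
readlink -f "$(command -v python)"    # -> /usr/bin/python3.x (exact version varies)
python --version                      # e.g. Python 3.10.x
```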
dockerfile/Dockerfile.triton.trt_llm_backend

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+ARG BASE_IMAGE
+
+FROM ${BASE_IMAGE} as base
+
+RUN apt-get update -q=2 && apt-get install -y --no-install-recommends python3-pip
+# Remove previous TRT installation
+# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries.
+RUN apt-get remove -y tensorrt*
+RUN pip3 uninstall -y tensorrt
+
+ARG TRT_VER
+
+ENV TRT_VERSION=$TRT_VER \
+    TRT_VER=$TRT_VER \
+    CUDA_VER=$CUDA_VERSION \
+    CUDNN_VER=$CUDNN_VERSION \
+    NCCL_VER=$NCCL_VERSION \
+    CUBLAS_VER=$CUBLAS_VERSION
+
+LABEL TRT_VERSION $TRT_VER
+
+# Download & install internal TRT release
+RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" \
+    && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb \
+    && apt install /tmp/cuda-keyring.deb \
+    && rm /tmp/cuda-keyring.deb \
+    && apt-get update -q=2
+
+ARG RELEASE_URL_TRT_x86
+ARG RELEASE_URL_TRT_ARM
+
+RUN [ "$(uname -m)" != "x86_64" ] && RELEASE_URL_TRT=${RELEASE_URL_TRT_ARM} || RELEASE_URL_TRT=${RELEASE_URL_TRT_x86} \
+    && curl -fSL -o /tmp/tensorrt.tar.gz ${RELEASE_URL_TRT} \
+    && tar xzvf /tmp/tensorrt.tar.gz -C /usr/local \
+    && rm /tmp/tensorrt.tar.gz \
+    && find /usr/local -maxdepth 1 -name Tens* -type d -exec ln -s {} /usr/local/tensorrt \;
+
+RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))" )*
+
+ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH}
+ENV TRT_ROOT=/usr/local/tensorrt
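A quick way to confirm what this new dockerfile produces is to run the TensorRT import inside the built base image. This is a hedged sketch, not part of the commit; it assumes the `trtllm_base` tag used in the README build command above.

```bash
# Hedged check (not from this commit): verify the tarball-installed TensorRT
# and the /usr/local/tensorrt symlink inside the freshly built base image.
docker run --rm trtllm_base bash -c '
    ls -l /usr/local/tensorrt &&
    python3 -c "import tensorrt; print(tensorrt.__version__)"'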

inflight_batcher_llm/CMakeLists.txt

Lines changed: 27 additions & 60 deletions
@@ -28,20 +28,7 @@ set(TRITON_BUILD
     OFF
     CACHE STRING "Using Triton build process")
 
-if(TRITON_BUILD)
-  set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm)
-  # Install build time dependencies. This section is executed during cmake
-  # configure time.
-  execute_process(
-    COMMAND bash -x ./tools/environment_setup.sh
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-    RESULT_VARIABLE CMD_RESULT)
-  if(NOT CMD_RESULT EQUAL "0")
-    message(FATAL_ERROR "Failed to install build time dependencies")
-  endif()
-else()
-  set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm)
-endif()
+set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm)
 
 include(${TRTLLM_DIR}/cpp/cmake/modules/find_library_create_target.cmake)
 
@@ -64,16 +51,6 @@ if(TRITON_ENABLE_METRICS AND NOT TRITON_ENABLE_STATS)
     FATAL_ERROR "TRITON_ENABLE_METRICS=ON requires TRITON_ENABLE_STATS=ON")
 endif()
 
-# The TRTLLM_BUILD_CONTAINER is used to compile the TRT-LLM libraries that are
-# needed for the TRT-LLM backend. The TRTLLM_BUILD_CONTAINER is launched
-# separately, and the artifacts will be copied back to the backend installation
-# directory.
-if(TRITON_BUILD)
-  set(TRTLLM_BUILD_CONTAINER
-      ""
-      CACHE STRING "Base image for building TRT-LLM")
-endif()
-
 set(TRITON_COMMON_REPO_TAG
     "main"
     CACHE STRING "Tag for triton-inference-server/common repo")
@@ -116,31 +93,6 @@ FetchContent_Declare(
   GIT_SHALLOW ON)
 FetchContent_MakeAvailable(repo-common repo-core repo-backend)
 
-# Compile TRT-LLM
-if(TRITON_BUILD)
-  set(TRITON_TRTLLM_DOCKER_NAME "tritonserver-trtllm")
-  add_custom_command(
-    OUTPUT tensorrt_llm_build
-    COMMENT "Building TensorRT-LLM"
-    COMMAND
-      cd ${CMAKE_CURRENT_SOURCE_DIR} && python3 tools/gen_trtllm_dockerfile.py
-      --trtllm-build-config="${CMAKE_BUILD_TYPE}"
-      --trtllm-base-image="${TRTLLM_BUILD_CONTAINER}" --output=Dockerfile.trtllm
-    COMMAND
-      cd ${CMAKE_CURRENT_SOURCE_DIR} && docker build
-      --cache-from=${TRITON_TRTLLM_DOCKER_NAME}
-      --cache-from=${TRITON_TRTLLM_DOCKER_NAME}_cache0
-      --cache-from=${TRITON_TRTLLM_DOCKER_NAME}_cache1 -t
-      ${TRITON_TRTLLM_DOCKER_NAME} -f ./Dockerfile.trtllm .
-    COMMAND docker rm trtllm_build || echo 'error ignored...' || true
-    COMMAND docker create --name trtllm_build ${TRITON_TRTLLM_DOCKER_NAME}
-    COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && rm -fr tensorrt_llm
-    COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && docker cp
-      trtllm_build:/app/tensorrt_llm tensorrt_llm
-    COMMAND docker cp trtllm_build:/opt/trtllm_lib trtllm_build
-    COMMAND docker rm trtllm_build)
-endif()
-
 #
 # The backend must be built into a shared library. Use an ldscript to hide all
 # symbols except for the TRITONBACKEND API.
@@ -153,11 +105,6 @@ set(SRCS src/libtensorrtllm.cc src/work_item.cc src/work_items_queue.cc
 
 add_library(triton-tensorrt-llm-backend SHARED ${SRCS})
 
-if(TRITON_BUILD)
-  add_custom_target(trtllm_target DEPENDS tensorrt_llm_build)
-  add_dependencies(triton-tensorrt-llm-backend trtllm_target)
-endif()
-
 add_library(TritonTensorRTLLMBackend::triton-tensorrt-llm-backend ALIAS
             triton-tensorrt-llm-backend)
 
@@ -352,10 +299,25 @@ if(TRITON_ENABLE_METRICS)
 endif()
 
 if(TRITON_BUILD)
-  add_dependencies(tensorrt_llm trtllm_target)
-  add_dependencies(tensorrt_llm_batch_manager trtllm_target)
-  add_dependencies(nvinfer_plugin_tensorrt_llm trtllm_target)
-endif()
+
+  if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")
+    execute_process(
+      WORKING_DIRECTORY ${TRTLLM_DIR}
+      COMMAND bash -x docker/common/install_pytorch.sh pypi COMMAND_ECHO STDOUT
+              COMMAND_ERROR_IS_FATAL ANY)
+  else()
+    execute_process(
+      WORKING_DIRECTORY ${TRTLLM_DIR}
+      COMMAND bash -x docker/common/install_pytorch.sh src_non_cxx11_abi
+              COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)
+  endif() # CMAKE_HOST_SYSTEM_PROCESSOR
+
+  execute_process(
+    WORKING_DIRECTORY ${TRTLLM_DIR}
+    COMMAND python3 scripts/build_wheel.py --trt_root /usr/local/tensorrt
+    COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)
+
+endif() # TRITON_BUILD
 
 target_link_libraries(
   triton-tensorrt-llm-backend
@@ -407,9 +369,14 @@ install(
   RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm)
 
 if(TRITON_BUILD)
-  install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/trtllm_build/
+  file(
+    GLOB
+    LIBINFER_PLUGIN_TENSORRT_LLM
+    "${TRTLLM_DIR}/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so*"
+    FOLLOW_SYMLINKS)
+  install(FILES ${LIBINFER_PLUGIN_TENSORRT_LLM}
           DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm)
-endif()
+endif() # TRITON_BUILD
 
 install(
   EXPORT triton-tensorrt-llm-backend-targets
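The CMake changes above replace the separate docker-based TRT-LLM compilation with configure-time steps: when `TRITON_BUILD=ON`, cmake now runs `docker/common/install_pytorch.sh` and `scripts/build_wheel.py --trt_root /usr/local/tensorrt` inside `TRTLLM_DIR`, then installs the resulting `libnvinfer_plugin_tensorrt_llm.so` next to the backend. A rough manual equivalent is sketched below; the script paths come from the diff, while the cmake and make invocation details are assumptions, not part of this commit.

```bash
# Hedged sketch (not from this commit) of the configure-time work now done
# when TRITON_BUILD=ON, written out as manual steps. Assumes the tensorrt_llm
# checkout sits one level above inflight_batcher_llm and TensorRT lives at
# /usr/local/tensorrt, matching the new base image.
cd tensorrtllm_backend/tensorrt_llm
bash docker/common/install_pytorch.sh pypi                     # x86_64 branch of the new CMake logic
python3 scripts/build_wheel.py --trt_root /usr/local/tensorrt  # builds the TRT-LLM libraries and wheel

cd ../inflight_batcher_llm
mkdir -p build && cd build
cmake -DTRITON_BUILD=OFF ..     # assumed standalone configure; TRTLLM_DIR defaults to ../tensorrt_llm
make -j"$(nproc)" install
```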

tools/environment_setup.sh

Lines changed: 0 additions & 53 deletions
This file was deleted.
