
Commit 32490e1

… into spolisetty_dlis_7657

pskiran1 committed Jan 13, 2025
2 parents 5a9b11f + 65ef9c8
Showing 62 changed files with 916 additions and 272 deletions.
2 changes: 1 addition & 1 deletion Dockerfile.sdk
@@ -29,7 +29,7 @@
#

# Base image on the minimum Triton container
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.11-py3-min
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.12-py3-min

ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
32 changes: 18 additions & 14 deletions Dockerfile.win10.min
@@ -37,9 +37,9 @@ RUN choco install unzip -y
#
# Installing TensorRT
#
-ARG TENSORRT_VERSION=10.4.0.26
+ARG TENSORRT_VERSION=10.7.0.23
ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows.win10.cuda-12.6.zip"
-ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip
+ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/zip/TensorRT-10.7.0.23.Windows.win10.cuda-12.6.zip
# COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP}
ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP}
RUN unzip /tmp/%TENSORRT_ZIP%
@@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"
#
# Installing cuDNN
#
-ARG CUDNN_VERSION=9.4.0.58
+ARG CUDNN_VERSION=9.6.0.74
ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip
-ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.4.0.58_cuda12-archive.zip
+ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.6.0.74_cuda12-archive.zip
ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP}
RUN unzip /tmp/%CUDNN_ZIP%
RUN move cudnn-* cudnn
@@ -75,20 +75,19 @@ RUN choco install git docker unzip -y
#
# Installing python
#
-ARG PYTHON_VERSION=3.10.11
+ARG PYTHON_VERSION=3.12.3
ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe
ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe
RUN python-%PYTHON_VERSION%-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%"
RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe"
RUN pip install --upgrade wheel setuptools docker
RUN pip install grpcio-tools psutil

LABEL PYTHON_VERSION=${PYTHON_VERSION}

#
# Installing CMake
#
-ARG CMAKE_VERSION=3.30.0
+ARG CMAKE_VERSION=3.30.5
RUN pip install cmake==%CMAKE_VERSION%

ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake
@@ -101,14 +100,16 @@ LABEL CMAKE_VERSION=${CMAKE_VERSION}
#
# Installing Visual Studio BuildTools: VS17 2022
#
-ARG BUILDTOOLS_VERSION=17.10.35201.131
# Download collect.exe in case of an install failure.
ADD https://aka.ms/vscollect.exe "C:\tmp\collect.exe"

# Use the latest release channel. For more control, specify the location of an internal layout.
# Download the Build Tools bootstrapper.
# ARG BUILD_TOOLS_SOURCE=https://aka.ms/vs/17/release/vs_buildtools.exe
-ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/28626b4b-f88f-4b55-a0cf-f3eaa2c643fb/e6c43d4dfb36338d954cdb3ad9010ab2a479e712088f4f6b016eadcc721bab28/vs_BuildTools.exe

+ARG BUILDTOOLS_VERSION=17.12.35506.116
+ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/5536698c-711c-4834-876f-2817d31a2ef2/58894fc272e86d3c3a6d85bf3a1df1e5a0685be8b9ab65d9f3cc5c2a8c6921cc/vs_BuildTools.exe

ADD ${BUILD_TOOLS_SOURCE} vs_buildtools.exe
# Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools workload, including recommended.
ARG VS_INSTALL_PATH_WP="C:\BuildTools"
@@ -149,12 +150,13 @@ WORKDIR /
# Installing CUDA
#
ARG CUDA_MAJOR=12
-ARG CUDA_MINOR=5
-ARG CUDA_PATCH=1
+ARG CUDA_MINOR=6
+ARG CUDA_PATCH=3
ARG CUDA_VERSION=${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}
ARG CUDA_PACKAGES="nvcc_${CUDA_MAJOR}.${CUDA_MINOR} \
cudart_${CUDA_MAJOR}.${CUDA_MINOR} \
nvml_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
nvrtc_${CUDA_MAJOR}.${CUDA_MINOR} nvrtc_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
cublas_${CUDA_MAJOR}.${CUDA_MINOR} cublas_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
cufft_${CUDA_MAJOR}.${CUDA_MINOR} cufft_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
curand_${CUDA_MAJOR}.${CUDA_MINOR} curand_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
@@ -175,21 +177,23 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi

RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%"

-ARG CUDNN_VERSION=9.4.0.58
+ENV CUDA_VERSION=${CUDA_VERSION}
+LABEL CUDA_VERSION="${CUDA_VERSION}"

+ARG CUDNN_VERSION=9.6.0.74
ENV CUDNN_VERSION ${CUDNN_VERSION}
COPY --from=dependency_base /cudnn /cudnn
RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\."
RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\."
RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\."
LABEL CUDNN_VERSION="${CUDNN_VERSION}"

-ARG TENSORRT_VERSION=10.4.0.26
+ARG TENSORRT_VERSION=10.7.0.23
ENV TRT_VERSION ${TENSORRT_VERSION}
COPY --from=dependency_base /TensorRT /TensorRT
RUN setx PATH "c:\TensorRT\lib;%PATH%"
LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"

-LABEL CUDA_VERSION="${CUDA_VERSION}"
# It is important that the entrypoint initialize VisualStudio
# environment otherwise the build will fail. Also set
# CMAKE_TOOLCHAIN_FILE and VCPKG_TARGET_TRIPLET so
10 changes: 5 additions & 5 deletions README.md
@@ -32,8 +32,8 @@

>[!WARNING]
>You are currently on the `main` branch which tracks under-development progress
->towards the next release. The current release is version [2.52.0](https://github.com/triton-inference-server/server/releases/latest)
->and corresponds to the 24.11 container release on NVIDIA GPU Cloud (NGC).
+>towards the next release. The current release is version [2.53.0](https://github.com/triton-inference-server/server/releases/latest)
+>and corresponds to the 24.12 container release on NVIDIA GPU Cloud (NGC).
Triton Inference Server is an open source inference serving software that
streamlines AI inferencing. Triton enables teams to deploy any AI model from
@@ -91,16 +91,16 @@ Inference Server with the

```bash
# Step 1: Create the example model repository
-git clone -b r24.11 https://github.com/triton-inference-server/server.git
+git clone -b r24.12 https://github.com/triton-inference-server/server.git
cd server/docs/examples
./fetch_models.sh

# Step 2: Launch triton from the NGC Triton container
-docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.11-py3 tritonserver --model-repository=/models
+docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.12-py3 tritonserver --model-repository=/models

# Step 3: Sending an Inference Request
# In a separate console, launch the image_client example from the NGC Triton SDK container
-docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.11-py3-sdk
+docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.12-py3-sdk
/workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg

# Inference should return the following
2 changes: 1 addition & 1 deletion TRITON_VERSION
@@ -1 +1 @@
-2.53.0dev
+2.54.0dev
14 changes: 10 additions & 4 deletions build.py
@@ -71,10 +71,10 @@
#

DEFAULT_TRITON_VERSION_MAP = {
"release_version": "2.53.0dev",
"triton_container_version": "24.12dev",
"upstream_container_version": "24.11",
"ort_version": "1.19.2",
"release_version": "2.54.0dev",
"triton_container_version": "24.01dev",
"upstream_container_version": "24.12",
"ort_version": "1.20.1",
"ort_openvino_version": "2024.4.0",
"standalone_openvino_version": "2024.4.0",
"dcgm_version": "3.3.6",
@@ -1238,6 +1238,8 @@ def create_dockerfile_linux(
find /opt/tritonserver/python -maxdepth 1 -type f -name \\
"tritonfrontend-*.whl" | xargs -I {} pip install --upgrade {}[all]
+RUN pip3 install -r python/openai/requirements.txt
"""
if not FLAGS.no_core_build:
# Add feature labels for SageMaker endpoint
@@ -1934,6 +1936,10 @@ def core_build(
os.path.join(install_dir, "include", "triton", "core"),
)

+cmake_script.cpdir(
+    os.path.join(repo_dir, "python", "openai"), os.path.join(install_dir, "python")
+)

cmake_script.cp(os.path.join(repo_dir, "LICENSE"), install_dir)
cmake_script.cp(os.path.join(repo_dir, "TRITON_VERSION"), install_dir)

2 changes: 1 addition & 1 deletion deploy/aws/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
-imageName: nvcr.io/nvidia/tritonserver:24.11-py3
+imageName: nvcr.io/nvidia/tritonserver:24.12-py3
pullPolicy: IfNotPresent
modelRepositoryPath: s3://triton-inference-server-repository/model_repository
numGpus: 1
2 changes: 1 addition & 1 deletion deploy/fleetcommand/Chart.yaml
@@ -26,7 +26,7 @@

apiVersion: v1
# appVersion is the Triton version; update when changing release
appVersion: "2.51.0"
appVersion: "2.53.0"
description: Triton Inference Server (Fleet Command)
name: triton-inference-server
# version is the Chart version; update when changing anything in the chart
6 changes: 3 additions & 3 deletions deploy/fleetcommand/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
-imageName: nvcr.io/nvidia/tritonserver:24.11-py3
+imageName: nvcr.io/nvidia/tritonserver:24.12-py3
pullPolicy: IfNotPresent
numGpus: 1
serverCommand: tritonserver
@@ -47,13 +47,13 @@ image:
#
# To set model control mode, uncomment and configure below
# TODO: Fix the following url, it is invalid
-# See https://github.com/triton-inference-server/server/blob/r24.11/docs/model_management.md
+# See https://github.com/triton-inference-server/server/blob/r24.12/docs/model_management.md
# for more details
#- --model-control-mode=explicit|poll|none
#
# Additional server args
#
-# see https://github.com/triton-inference-server/server/blob/r24.11/README.md
+# see https://github.com/triton-inference-server/server/blob/r24.12/README.md
# for more details

service:
2 changes: 1 addition & 1 deletion deploy/gcp/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
-imageName: nvcr.io/nvidia/tritonserver:24.11-py3
+imageName: nvcr.io/nvidia/tritonserver:24.12-py3
pullPolicy: IfNotPresent
modelRepositoryPath: gs://triton-inference-server-repository/model_repository
numGpus: 1
(additional changed file; name not shown)

@@ -33,7 +33,7 @@ metadata:
namespace: default
spec:
containers:
-  - image: nvcr.io/nvidia/tritonserver:24.11-py3-sdk
+  - image: nvcr.io/nvidia/tritonserver:24.12-py3-sdk
imagePullPolicy: Always
name: nv-triton-client
securityContext:
6 changes: 3 additions & 3 deletions deploy/gke-marketplace-app/server-deployer/build_and_push.sh
@@ -27,9 +27,9 @@

export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/')
export APP_NAME=tritonserver
-export MAJOR_VERSION=2.51
-export MINOR_VERSION=2.51.0
-export NGC_VERSION=24.11-py3
+export MAJOR_VERSION=2.53
+export MINOR_VERSION=2.53.0
+export NGC_VERSION=24.12-py3

docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION

(additional changed file; name not shown)

@@ -25,7 +25,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

apiVersion: v1
appVersion: "2.51"
appVersion: "2.53"
description: Triton Inference Server
name: triton-inference-server
-version: 2.51.0
+version: 2.53.0
(additional changed file; name not shown)

@@ -31,14 +31,14 @@ maxReplicaCount: 3
tritonProtocol: HTTP
# HPA GPU utilization autoscaling target
HPATargetAverageValue: 85
-modelRepositoryPath: gs://triton_sample_models/24.11
-publishedVersion: '2.51.0'
+modelRepositoryPath: gs://triton_sample_models/24.12
+publishedVersion: '2.53.0'
gcpMarketplace: true

image:
registry: gcr.io
repository: nvidia-ngc-public/tritonserver
-tag: 24.11-py3
+tag: 24.12-py3
pullPolicy: IfNotPresent
# modify the model repository here to match your GCP storage bucket
numGpus: 1
(additional changed file; name not shown)

@@ -27,7 +27,7 @@
x-google-marketplace:
schemaVersion: v2
applicationApiVersion: v1beta1
-publishedVersion: '2.51.0'
+publishedVersion: '2.53.0'
publishedVersionMetadata:
releaseNote: >-
Initial release.
4 changes: 2 additions & 2 deletions deploy/gke-marketplace-app/server-deployer/schema.yaml
@@ -27,7 +27,7 @@
x-google-marketplace:
schemaVersion: v2
applicationApiVersion: v1beta1
-publishedVersion: '2.51.0'
+publishedVersion: '2.53.0'
publishedVersionMetadata:
releaseNote: >-
Initial release.
@@ -89,7 +89,7 @@ properties:
modelRepositoryPath:
type: string
title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc.
-default: gs://triton_sample_models/24.11
+default: gs://triton_sample_models/24.12
image.ldPreloadPath:
type: string
title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin or Tensorflow custom ops, the compiled shared library must be provided via LD_PRELOAD environment variable.
6 changes: 3 additions & 3 deletions deploy/gke-marketplace-app/trt-engine/README.md
@@ -33,7 +33,7 @@
```
docker run --gpus all -it --network host \
--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
- -v ~:/scripts nvcr.io/nvidia/tensorrt:24.11-py3
+ -v ~:/scripts nvcr.io/nvidia/tensorrt:24.12-py3
pip install onnx six torch tf2onnx tensorflow
@@ -57,7 +57,7 @@ mkdir -p engines
python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh
-gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.11/bert/1/model.plan
+gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.12/bert/1/model.plan
```

-For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.11/` should be updated accordingly with the correct version.
+For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.12/` should be updated accordingly with the correct version.
2 changes: 1 addition & 1 deletion deploy/k8s-onprem/values.yaml
@@ -29,7 +29,7 @@ tags:
loadBalancing: true

image:
-imageName: nvcr.io/nvidia/tritonserver:24.11-py3
+imageName: nvcr.io/nvidia/tritonserver:24.12-py3
pullPolicy: IfNotPresent
modelRepositoryServer: < Replace with the IP Address of your file server >
modelRepositoryPath: /srv/models
2 changes: 1 addition & 1 deletion deploy/oci/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
-imageName: nvcr.io/nvidia/tritonserver:24.11-py3
+imageName: nvcr.io/nvidia/tritonserver:24.12-py3
pullPolicy: IfNotPresent
modelRepositoryPath: s3://https://<OCI_NAMESPACE>.compat.objectstorage.<OCI_REGION>.oraclecloud.com:443/triton-inference-server-repository
numGpus: 1
27 changes: 27 additions & 0 deletions docs/backend_guide/vllm.rst
@@ -1,3 +1,30 @@
..
.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
..
.. Redistribution and use in source and binary forms, with or without
.. modification, are permitted provided that the following conditions
.. are met:
.. * Redistributions of source code must retain the above copyright
.. notice, this list of conditions and the following disclaimer.
.. * Redistributions in binary form must reproduce the above copyright
.. notice, this list of conditions and the following disclaimer in the
.. documentation and/or other materials provided with the distribution.
.. * Neither the name of NVIDIA CORPORATION nor the names of its
.. contributors may be used to endorse or promote products derived
.. from this software without specific prior written permission.
..
.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########
vLLM
########
(remaining changed files not shown)
