diff --git a/Dockerfile.sdk b/Dockerfile.sdk index 3019114930..19d0afe5f7 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -29,7 +29,7 @@ # # Base image on the minimum Triton container -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.11-py3-min +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.12-py3-min ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index dec972eaf3..9147d70718 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -37,9 +37,9 @@ RUN choco install unzip -y # # Installing TensorRT # -ARG TENSORRT_VERSION=10.4.0.26 +ARG TENSORRT_VERSION=10.7.0.23 ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows.win10.cuda-12.6.zip" -ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip +ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/zip/TensorRT-10.7.0.23.Windows.win10.cuda-12.6.zip # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP} RUN unzip /tmp/%TENSORRT_ZIP% @@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" # # Installing cuDNN # -ARG CUDNN_VERSION=9.4.0.58 +ARG CUDNN_VERSION=9.6.0.74 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip -ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.4.0.58_cuda12-archive.zip +ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.6.0.74_cuda12-archive.zip ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} RUN unzip /tmp/%CUDNN_ZIP% RUN move cudnn-* cudnn @@ -75,20 +75,19 @@ RUN choco install git docker unzip -y # # Installing python # -ARG PYTHON_VERSION=3.10.11 +ARG PYTHON_VERSION=3.12.3 ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe RUN python-%PYTHON_VERSION%-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%" RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe" RUN pip install --upgrade wheel setuptools docker -RUN pip install grpcio-tools psutil LABEL PYTHON_VERSION=${PYTHON_VERSION} # # Installing CMake # -ARG CMAKE_VERSION=3.30.0 +ARG CMAKE_VERSION=3.30.5 RUN pip install cmake==%CMAKE_VERSION% ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake @@ -101,14 +100,16 @@ LABEL CMAKE_VERSION=${CMAKE_VERSION} # # Installing Visual Studio BuildTools: VS17 2022 # -ARG BUILDTOOLS_VERSION=17.10.35201.131 # Download collect.exe in case of an install failure. ADD https://aka.ms/vscollect.exe "C:\tmp\collect.exe" # Use the latest release channel. For more control, specify the location of an internal layout. # Download the Build Tools bootstrapper. 
# ARG BUILD_TOOLS_SOURCE=https://aka.ms/vs/17/release/vs_buildtools.exe -ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/28626b4b-f88f-4b55-a0cf-f3eaa2c643fb/e6c43d4dfb36338d954cdb3ad9010ab2a479e712088f4f6b016eadcc721bab28/vs_BuildTools.exe + +ARG BUILDTOOLS_VERSION=17.12.35506.116 +ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/5536698c-711c-4834-876f-2817d31a2ef2/58894fc272e86d3c3a6d85bf3a1df1e5a0685be8b9ab65d9f3cc5c2a8c6921cc/vs_BuildTools.exe + ADD ${BUILD_TOOLS_SOURCE} vs_buildtools.exe # Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools workload, including recommended. ARG VS_INSTALL_PATH_WP="C:\BuildTools" @@ -149,12 +150,13 @@ WORKDIR / # Installing CUDA # ARG CUDA_MAJOR=12 -ARG CUDA_MINOR=5 -ARG CUDA_PATCH=1 +ARG CUDA_MINOR=6 +ARG CUDA_PATCH=3 ARG CUDA_VERSION=${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH} ARG CUDA_PACKAGES="nvcc_${CUDA_MAJOR}.${CUDA_MINOR} \ cudart_${CUDA_MAJOR}.${CUDA_MINOR} \ nvml_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ + nvrtc_${CUDA_MAJOR}.${CUDA_MINOR} nvrtc_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ cublas_${CUDA_MAJOR}.${CUDA_MINOR} cublas_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ cufft_${CUDA_MAJOR}.${CUDA_MINOR} cufft_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ curand_${CUDA_MAJOR}.${CUDA_MINOR} curand_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ @@ -175,7 +177,10 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" -ARG CUDNN_VERSION=9.4.0.58 +ENV CUDA_VERSION=${CUDA_VERSION} +LABEL CUDA_VERSION="${CUDA_VERSION}" + +ARG CUDNN_VERSION=9.6.0.74 ENV CUDNN_VERSION ${CUDNN_VERSION} COPY --from=dependency_base /cudnn /cudnn RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." @@ -183,13 +188,12 @@ RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\." LABEL CUDNN_VERSION="${CUDNN_VERSION}" -ARG TENSORRT_VERSION=10.4.0.26 +ARG TENSORRT_VERSION=10.7.0.23 ENV TRT_VERSION ${TENSORRT_VERSION} COPY --from=dependency_base /TensorRT /TensorRT RUN setx PATH "c:\TensorRT\lib;%PATH%" LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" -LABEL CUDA_VERSION="${CUDA_VERSION}" # It is important that the entrypoint initialize VisualStudio # environment otherwise the build will fail. Also set # CMAKE_TOOLCHAIN_FILE and VCPKG_TARGET_TRIPLET so diff --git a/README.md b/README.md index 79e572b97d..b9c076cc60 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,8 @@ >[!WARNING] >You are currently on the `main` branch which tracks under-development progress ->towards the next release. The current release is version [2.52.0](https://github.com/triton-inference-server/server/releases/latest) ->and corresponds to the 24.11 container release on NVIDIA GPU Cloud (NGC). +>towards the next release. The current release is version [2.53.0](https://github.com/triton-inference-server/server/releases/latest) +>and corresponds to the 24.12 container release on NVIDIA GPU Cloud (NGC). Triton Inference Server is an open source inference serving software that streamlines AI inferencing. 
Triton enables teams to deploy any AI model from @@ -91,16 +91,16 @@ Inference Server with the ```bash # Step 1: Create the example model repository -git clone -b r24.11 https://github.com/triton-inference-server/server.git +git clone -b r24.12 https://github.com/triton-inference-server/server.git cd server/docs/examples ./fetch_models.sh # Step 2: Launch triton from the NGC Triton container -docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.11-py3 tritonserver --model-repository=/models +docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.12-py3 tritonserver --model-repository=/models # Step 3: Sending an Inference Request # In a separate console, launch the image_client example from the NGC Triton SDK container -docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.11-py3-sdk +docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.12-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg # Inference should return the following diff --git a/TRITON_VERSION b/TRITON_VERSION index 7eb4ffb28a..fd02e9348b 100644 --- a/TRITON_VERSION +++ b/TRITON_VERSION @@ -1 +1 @@ -2.53.0dev +2.54.0dev diff --git a/build.py b/build.py index 8b5cb32f98..1cd03f5e99 100755 --- a/build.py +++ b/build.py @@ -71,10 +71,10 @@ # DEFAULT_TRITON_VERSION_MAP = { - "release_version": "2.53.0dev", - "triton_container_version": "24.12dev", - "upstream_container_version": "24.11", - "ort_version": "1.19.2", + "release_version": "2.54.0dev", + "triton_container_version": "25.01dev", + "upstream_container_version": "24.12", + "ort_version": "1.20.1", "ort_openvino_version": "2024.4.0", "standalone_openvino_version": "2024.4.0", "dcgm_version": "3.3.6", @@ -1238,6 +1238,8 @@ def create_dockerfile_linux( find /opt/tritonserver/python -maxdepth 1 -type f -name \\ "tritonfrontend-*.whl" | xargs -I {} pip install --upgrade {}[all] +RUN pip3 install -r python/openai/requirements.txt + """ if not FLAGS.no_core_build: # Add feature labels for SageMaker endpoint @@ -1934,6 +1936,10 @@ def core_build( os.path.join(install_dir, "include", "triton", "core"), ) + cmake_script.cpdir( + os.path.join(repo_dir, "python", "openai"), os.path.join(install_dir, "python") + ) + cmake_script.cp(os.path.join(repo_dir, "LICENSE"), install_dir) cmake_script.cp(os.path.join(repo_dir, "TRITON_VERSION"), install_dir) diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml index 4fcdd14bdb..be118becce 100644 --- a/deploy/aws/values.yaml +++ b/deploy/aws/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.11-py3 + imageName: nvcr.io/nvidia/tritonserver:24.12-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml index 4e3c87c387..aac221acd0 100644 --- a/deploy/fleetcommand/Chart.yaml +++ b/deploy/fleetcommand/Chart.yaml @@ -26,7 +26,7 @@ apiVersion: v1 # appVersion is the Triton version; update when changing release -appVersion: "2.51.0" +appVersion: "2.53.0" description: Triton Inference Server (Fleet Command) name: triton-inference-server # version is the Chart version; update when changing anything in the chart diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml index ff5513c7d7..30b1c331d1 100644 --- a/deploy/fleetcommand/values.yaml +++ b/deploy/fleetcommand/values.yaml @@
-27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.11-py3 + imageName: nvcr.io/nvidia/tritonserver:24.12-py3 pullPolicy: IfNotPresent numGpus: 1 serverCommand: tritonserver @@ -47,13 +47,13 @@ image: # # To set model control mode, uncomment and configure below # TODO: Fix the following url, it is invalid - # See https://github.com/triton-inference-server/server/blob/r24.11/docs/model_management.md + # See https://github.com/triton-inference-server/server/blob/r24.12/docs/model_management.md # for more details #- --model-control-mode=explicit|poll|none # # Additional server args # - # see https://github.com/triton-inference-server/server/blob/r24.11/README.md + # see https://github.com/triton-inference-server/server/blob/r24.12/README.md # for more details service: diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml index f79cb75134..7a27c61efa 100644 --- a/deploy/gcp/values.yaml +++ b/deploy/gcp/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.11-py3 + imageName: nvcr.io/nvidia/tritonserver:24.12-py3 pullPolicy: IfNotPresent modelRepositoryPath: gs://triton-inference-server-repository/model_repository numGpus: 1 diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml index c27a327e2f..6712d7d381 100644 --- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml +++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml @@ -33,7 +33,7 @@ metadata: namespace: default spec: containers: - - image: nvcr.io/nvidia/tritonserver:24.11-py3-sdk + - image: nvcr.io/nvidia/tritonserver:24.12-py3-sdk imagePullPolicy: Always name: nv-triton-client securityContext: diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh index a0c9762865..8c26ee5ed0 100755 --- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -27,9 +27,9 @@ export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/') export APP_NAME=tritonserver -export MAJOR_VERSION=2.51 -export MINOR_VERSION=2.51.0 -export NGC_VERSION=24.11-py3 +export MAJOR_VERSION=2.53 +export MINOR_VERSION=2.53.0 +export NGC_VERSION=24.12-py3 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml index 027deb1d2f..7ad8ba851b 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
apiVersion: v1 -appVersion: "2.51" +appVersion: "2.53" description: Triton Inference Server name: triton-inference-server -version: 2.51.0 +version: 2.53.0 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml index dfb992a543..673ec6acb3 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -31,14 +31,14 @@ maxReplicaCount: 3 tritonProtocol: HTTP # HPA GPU utilization autoscaling target HPATargetAverageValue: 85 -modelRepositoryPath: gs://triton_sample_models/24.11 -publishedVersion: '2.51.0' +modelRepositoryPath: gs://triton_sample_models/24.12 +publishedVersion: '2.53.0' gcpMarketplace: true image: registry: gcr.io repository: nvidia-ngc-public/tritonserver - tag: 24.11-py3 + tag: 24.12-py3 pullPolicy: IfNotPresent # modify the model repository here to match your GCP storage bucket numGpus: 1 diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml index be46874dba..eefb209efb 100644 --- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.51.0' + publishedVersion: '2.53.0' publishedVersionMetadata: releaseNote: >- Initial release. diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml index 699fa04a68..1defe7ca42 100644 --- a/deploy/gke-marketplace-app/server-deployer/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -27,7 +27,7 @@ x-google-marketplace: schemaVersion: v2 applicationApiVersion: v1beta1 - publishedVersion: '2.51.0' + publishedVersion: '2.53.0' publishedVersionMetadata: releaseNote: >- Initial release. @@ -89,7 +89,7 @@ properties: modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. - default: gs://triton_sample_models/24.11 + default: gs://triton_sample_models/24.12 image.ldPreloadPath: type: string title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin or Tensorflow custom ops, the compiled shared library must be provided via LD_PRELOAD environment variable. 
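The chart values above are what Helm consumes at deploy time, so the bumped image tag, published version, and model repository path can also be exercised without editing values.yaml. A minimal sketch, assuming Helm 3 and a checkout of this repository; the release name `triton` and the bucket `gs://my-bucket/24.12` are hypothetical stand-ins:

```bash
# Install the GKE marketplace server-deployer chart, overriding the
# defaults shown in values.yaml above; --set values take precedence
# over the values shipped with the chart.
helm install triton ./deploy/gke-marketplace-app/server-deployer/chart/triton \
  --set image.tag=24.12-py3 \
  --set publishedVersion='2.53.0' \
  --set modelRepositoryPath=gs://my-bucket/24.12
```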
diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md index 6a16fc9523..bdf655b2b0 100644 --- a/deploy/gke-marketplace-app/trt-engine/README.md +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -33,7 +33,7 @@ ``` docker run --gpus all -it --network host \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ - -v ~:/scripts nvcr.io/nvidia/tensorrt:24.11-py3 + -v ~:/scripts nvcr.io/nvidia/tensorrt:24.12-py3 pip install onnx six torch tf2onnx tensorflow @@ -57,7 +57,7 @@ mkdir -p engines python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh -gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.11/bert/1/model.plan +gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.12/bert/1/model.plan ``` -For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24.11/` should be updated accordingly with the correct version. +For each Triton upgrade, the container version used to generate the model and the model path in GCS `gs://triton_sample_models/24.12/` should be updated accordingly. diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml index 77f1b47c5b..8e2fdcda6d 100644 --- a/deploy/k8s-onprem/values.yaml +++ b/deploy/k8s-onprem/values.yaml @@ -29,7 +29,7 @@ tags: loadBalancing: true image: - imageName: nvcr.io/nvidia/tritonserver:24.11-py3 + imageName: nvcr.io/nvidia/tritonserver:24.12-py3 pullPolicy: IfNotPresent modelRepositoryServer: < Replace with the IP Address of your file server > modelRepositoryPath: /srv/models diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml index 1a62e52e7a..716ac24400 100644 --- a/deploy/oci/values.yaml +++ b/deploy/oci/values.yaml @@ -27,7 +27,7 @@ replicaCount: 1 image: - imageName: nvcr.io/nvidia/tritonserver:24.11-py3 + imageName: nvcr.io/nvidia/tritonserver:24.12-py3 pullPolicy: IfNotPresent modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository numGpus: 1 diff --git a/docs/backend_guide/vllm.rst b/docs/backend_guide/vllm.rst index 06be17128f..d28f2af5ab 100644 --- a/docs/backend_guide/vllm.rst +++ b/docs/backend_guide/vllm.rst @@ -1,3 +1,30 @@ +.. +.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. +.. Redistribution and use in source and binary forms, with or without +.. modification, are permitted provided that the following conditions +.. are met: +.. * Redistributions of source code must retain the above copyright +.. notice, this list of conditions and the following disclaimer. +.. * Redistributions in binary form must reproduce the above copyright +.. notice, this list of conditions and the following disclaimer in the +.. documentation and/or other materials provided with the distribution. +.. * Neither the name of NVIDIA CORPORATION nor the names of its +.. contributors may be used to endorse or promote products derived +.. from this software without specific prior written permission. +.. +.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +..
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######## vLLM ######## diff --git a/docs/client_guide/api_reference.rst b/docs/client_guide/api_reference.rst index 0493510e71..f626c1ac9b 100644 --- a/docs/client_guide/api_reference.rst +++ b/docs/client_guide/api_reference.rst @@ -1,3 +1,30 @@ +.. +.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. +.. Redistribution and use in source and binary forms, with or without +.. modification, are permitted provided that the following conditions +.. are met: +.. * Redistributions of source code must retain the above copyright +.. notice, this list of conditions and the following disclaimer. +.. * Redistributions in binary form must reproduce the above copyright +.. notice, this list of conditions and the following disclaimer in the +.. documentation and/or other materials provided with the distribution. +.. * Neither the name of NVIDIA CORPORATION nor the names of its +.. contributors may be used to endorse or promote products derived +.. from this software without specific prior written permission. +.. +.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #### API Reference #### diff --git a/docs/client_guide/in_process.rst b/docs/client_guide/in_process.rst index b1ee46a925..56ab778440 100644 --- a/docs/client_guide/in_process.rst +++ b/docs/client_guide/in_process.rst @@ -1,3 +1,30 @@ +.. +.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. +.. Redistribution and use in source and binary forms, with or without +.. modification, are permitted provided that the following conditions +.. are met: +.. * Redistributions of source code must retain the above copyright +.. notice, this list of conditions and the following disclaimer. +.. * Redistributions in binary form must reproduce the above copyright +.. notice, this list of conditions and the following disclaimer in the +.. documentation and/or other materials provided with the distribution. +.. * Neither the name of NVIDIA CORPORATION nor the names of its +.. contributors may be used to endorse or promote products derived +.. from this software without specific prior written permission. +.. +.. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #### In-Process Triton Server API #### diff --git a/docs/client_guide/kserve.rst b/docs/client_guide/kserve.rst index e2ac33c45f..310435fc4e 100644 --- a/docs/client_guide/kserve.rst +++ b/docs/client_guide/kserve.rst @@ -1,3 +1,30 @@ +.. +.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. +.. Redistribution and use in source and binary forms, with or without +.. modification, are permitted provided that the following conditions +.. are met: +.. * Redistributions of source code must retain the above copyright +.. notice, this list of conditions and the following disclaimer. +.. * Redistributions in binary form must reproduce the above copyright +.. notice, this list of conditions and the following disclaimer in the +.. documentation and/or other materials provided with the distribution. +.. * Neither the name of NVIDIA CORPORATION nor the names of its +.. contributors may be used to endorse or promote products derived +.. from this software without specific prior written permission. +.. +.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #### KServe API #### diff --git a/docs/client_guide/kserve_extension.rst b/docs/client_guide/kserve_extension.rst index 7a78484499..dde6c4062b 100644 --- a/docs/client_guide/kserve_extension.rst +++ b/docs/client_guide/kserve_extension.rst @@ -1,3 +1,30 @@ +.. +.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. +.. Redistribution and use in source and binary forms, with or without +.. modification, are permitted provided that the following conditions +.. are met: +.. * Redistributions of source code must retain the above copyright +.. notice, this list of conditions and the following disclaimer. +.. * Redistributions in binary form must reproduce the above copyright +.. notice, this list of conditions and the following disclaimer in the +.. documentation and/or other materials provided with the distribution. +.. * Neither the name of NVIDIA CORPORATION nor the names of its +.. 
contributors may be used to endorse or promote products derived +.. from this software without specific prior written permission. +.. +.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #### Extensions #### diff --git a/docs/client_guide/python.rst b/docs/client_guide/python.rst index 2610ce2d87..545f4f6042 100644 --- a/docs/client_guide/python.rst +++ b/docs/client_guide/python.rst @@ -1,3 +1,30 @@ +.. +.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. +.. Redistribution and use in source and binary forms, with or without +.. modification, are permitted provided that the following conditions +.. are met: +.. * Redistributions of source code must retain the above copyright +.. notice, this list of conditions and the following disclaimer. +.. * Redistributions in binary form must reproduce the above copyright +.. notice, this list of conditions and the following disclaimer in the +.. documentation and/or other materials provided with the distribution. +.. * Neither the name of NVIDIA CORPORATION nor the names of its +.. contributors may be used to endorse or promote products derived +.. from this software without specific prior written permission. +.. +.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #### Python #### diff --git a/docs/client_guide/python_readme.rst b/docs/client_guide/python_readme.rst index 91e3f1b26d..e7a79abe60 100644 --- a/docs/client_guide/python_readme.rst +++ b/docs/client_guide/python_readme.rst @@ -1,32 +1,32 @@ +.. +.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. +.. Redistribution and use in source and binary forms, with or without +.. modification, are permitted provided that the following conditions +.. are met: +.. * Redistributions of source code must retain the above copyright +.. notice, this list of conditions and the following disclaimer. +.. * Redistributions in binary form must reproduce the above copyright +.. notice, this list of conditions and the following disclaimer in the +.. 
documentation and/or other materials provided with the distribution. +.. * Neither the name of NVIDIA CORPORATION nor the names of its +.. contributors may be used to endorse or promote products derived +.. from this software without specific prior written permission. +.. +.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + .. raw:: html - Triton Inference Server In-Process Python API [BETA] ==================================================== diff --git a/docs/conf.py b/docs/conf.py index 6c59e45c72..0b44f7c8b2 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -177,7 +177,7 @@ "switcher": { # use for local testing # "json_url": "http://localhost:8000/_static/switcher.json", - "json_url": "https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/_static/switcher.json", + "json_url": "https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/_static/switcher.json", "version_match": one_before if "dev" in version_long else version_short, }, "navbar_start": ["navbar-logo", "version-switcher"], diff --git a/docs/contents.rst b/docs/contents.rst index ff132c729d..555c433d85 100644 --- a/docs/contents.rst +++ b/docs/contents.rst @@ -1,27 +1,29 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +.. +.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+.. +.. Redistribution and use in source and binary forms, with or without +.. modification, are permitted provided that the following conditions +.. are met: +.. * Redistributions of source code must retain the above copyright +.. notice, this list of conditions and the following disclaimer. +.. * Redistributions in binary form must reproduce the above copyright +.. notice, this list of conditions and the following disclaimer in the +.. documentation and/or other materials provided with the distribution. +.. * Neither the name of NVIDIA CORPORATION nor the names of its +.. contributors may be used to endorse or promote products derived +.. from this software without specific prior written permission. +.. +.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .. toctree:: :hidden: diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index 0622414609..fcb4ce14e9 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -173,7 +173,7 @@ $ ./build.py ... --repo-tag=common: --repo-tag=core:` will default to the branch name. For example, if you are building on the -r24.11 branch, `<container tag>` will default to r24.11. If you are +r24.12 branch, `<container tag>` will default to r24.12. If you are building on any other branch (including the *main* branch) then `<container tag>` will default to "main". Therefore, you typically do not need to provide `<container tag>` at all (nor the preceding @@ -334,8 +334,8 @@ python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild If you are building on *main* branch then `<container tag>` will default to "main". If you are building on a release branch then `<container tag>` will default to the branch name. For example, if you -are building on the r24.11 branch, `<container tag>` will default to -r24.11. Therefore, you typically do not need to provide `<container tag>` +are building on the r24.12 branch, `<container tag>` will default to +r24.12. Therefore, you typically do not need to provide `<container tag>` at all (nor the preceding colon). You can use a different `<container tag>` for a component to instead use the corresponding branch/tag in the build. For example, if you have a branch called diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md index 8bddd46aeb..9f20a05347 100644 --- a/docs/customization_guide/compose.md +++ b/docs/customization_guide/compose.md @@ -46,8 +46,8 @@ The `compose.py` script can be found in the Simply clone the repository and run `compose.py` to create a custom container. Note: Created container version will depend on the branch that was cloned. For example branch - [r24.11](https://github.com/triton-inference-server/server/tree/r24.11) -should be used to create a image based on the NGC 24.11 Triton release. + [r24.12](https://github.com/triton-inference-server/server/tree/r24.12) +should be used to create an image based on the NGC 24.12 Triton release.
`compose.py` provides `--backend`, `--repoagent` options that allow you to specify which backends and repository agents to include in the custom image. @@ -79,20 +79,20 @@ For example, running ``` python3 compose.py --backend pytorch --repoagent checksum ``` -on branch [r24.11](https://github.com/triton-inference-server/server/tree/r24.11) pulls: -- `min` container `nvcr.io/nvidia/tritonserver:24.11-py3-min` -- `full` container `nvcr.io/nvidia/tritonserver:24.11-py3` +on branch [r24.12](https://github.com/triton-inference-server/server/tree/r24.12) pulls: +- `min` container `nvcr.io/nvidia/tritonserver:24.12-py3-min` +- `full` container `nvcr.io/nvidia/tritonserver:24.12-py3` Alternatively, users can specify the version of Triton container to pull from any branch by either: 1. Adding flag `--container-version <container version>` to branch ``` -python3 compose.py --backend pytorch --repoagent checksum --container-version 24.11 +python3 compose.py --backend pytorch --repoagent checksum --container-version 24.12 ``` 2. Specifying `--image min,<min container image> --image full,<full container image>`. The user is responsible for specifying compatible `min` and `full` containers. ``` -python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.11-py3-min --image full,nvcr.io/nvidia/tritonserver:24.11-py3 +python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.12-py3-min --image full,nvcr.io/nvidia/tritonserver:24.12-py3 ``` Method 1 and 2 will result in the same composed container. Furthermore, `--image` flag overrides the `--container-version` flag when both are specified. @@ -103,8 +103,8 @@ Note: 2. vLLM and TensorRT-LLM backends are currently not supported backends for `compose.py`. If you want to build additional backends on top of these backends, it would be better to [build it yourself](#build-it-yourself) by using -`nvcr.io/nvidia/tritonserver:24.11-vllm-python-py3` or -`nvcr.io/nvidia/tritonserver:24.11-trtllm-python-py3` as a `min` container. +`nvcr.io/nvidia/tritonserver:24.12-vllm-python-py3` or +`nvcr.io/nvidia/tritonserver:24.12-trtllm-python-py3` as a `min` container. ### CPU-only container composition diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md index 39891b3177..a85a10f48b 100644 --- a/docs/customization_guide/test.md +++ b/docs/customization_guide/test.md @@ -49,7 +49,7 @@ $ ./gen_qa_custom_ops ``` This will create multiple model repositories in /tmp/\<version\>/qa_* -(for example /tmp/24.11/qa_model_repository). The TensorRT models +(for example /tmp/24.12/qa_model_repository). The TensorRT models will be created for the GPU on the system that CUDA considers device 0 (zero). If you have multiple GPUs on your system see the documentation in the scripts for how to target a specific GPU. diff --git a/docs/generate_docs.py b/docs/generate_docs.py index 065c14de1e..048bd77035 100755 --- a/docs/generate_docs.py +++ b/docs/generate_docs.py @@ -41,11 +41,11 @@ """ TODO: Needs to handle cross-branch linkage. -For example, server/docs/user_guide/architecture.md on branch 24.11 links to +For example, server/docs/user_guide/architecture.md on branch 24.12 links to server/docs/user_guide/model_analyzer.md on main branch. In this case, the hyperlink of model_analyzer.md should be a URL instead of relative path. -Another example can be server/docs/user_guide/model_analyzer.md on branch 24.11 +Another example can be server/docs/user_guide/model_analyzer.md on branch 24.12 links to a file in server repo with relative path.
Currently all URLs are hardcoded to main branch. We need to make sure that the URL actually points to the correct branch. We also need to handle cases like deprecated or removed files from diff --git a/docs/getting_started/llm.md b/docs/getting_started/llm.md index b5f738c3d5..cecf565f51 100644 --- a/docs/getting_started/llm.md +++ b/docs/getting_started/llm.md @@ -1,3 +1,31 @@ + + # Deploying Phi-3 Model with Triton and TRT-LLM This guide captures the steps to build Phi-3 with TRT-LLM and deploy with Triton Inference Server. It also shows how to use GenAI-Perf to run benchmarks to measure model performance in terms of throughput and latency. @@ -326,7 +354,7 @@ All config files inside /tensorrtllm\_backend/all\_models/inflight\_batcher\_llm
ensemble/config.pbtxt - # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -836,7 +864,7 @@ All config files inside /tensorrtllm\_backend/all\_models/inflight\_batcher\_llm
postprocessing/config.pbtxt - # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -965,7 +993,7 @@ All config files inside /tensorrtllm\_backend/all\_models/inflight\_batcher\_llm
preprocessing/config.pbtxt - # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -1160,7 +1188,7 @@ All config files inside /tensorrtllm\_backend/all\_models/inflight\_batcher\_llm tensorrt_llm/config.pbtxt - # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/docs/getting_started/quick_deployment_by_backend.rst b/docs/getting_started/quick_deployment_by_backend.rst index c8e461c00c..aefa56787b 100644 --- a/docs/getting_started/quick_deployment_by_backend.rst +++ b/docs/getting_started/quick_deployment_by_backend.rst @@ -1,3 +1,30 @@ +.. +.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. +.. Redistribution and use in source and binary forms, with or without +.. modification, are permitted provided that the following conditions +.. are met: +.. * Redistributions of source code must retain the above copyright +.. notice, this list of conditions and the following disclaimer. +.. * Redistributions in binary form must reproduce the above copyright +.. notice, this list of conditions and the following disclaimer in the +.. documentation and/or other materials provided with the distribution. +.. * Neither the name of NVIDIA CORPORATION nor the names of its +.. contributors may be used to endorse or promote products derived +.. from this software without specific prior written permission. +.. +.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + #### Quick Deployment Guide by backend #### diff --git a/docs/getting_started/quick_start.rst b/docs/getting_started/quick_start.rst index 8af21534a3..27f100e3cd 100644 --- a/docs/getting_started/quick_start.rst +++ b/docs/getting_started/quick_start.rst @@ -1,32 +1,32 @@ +.. +.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. +.. Redistribution and use in source and binary forms, with or without +.. modification, are permitted provided that the following conditions +.. are met: +.. * Redistributions of source code must retain the above copyright +.. notice, this list of conditions and the following disclaimer. +.. * Redistributions in binary form must reproduce the above copyright +.. notice, this list of conditions and the following disclaimer in the +.. documentation and/or other materials provided with the distribution. +.. 
* Neither the name of NVIDIA CORPORATION nor the names of its +.. contributors may be used to endorse or promote products derived +.. from this software without specific prior written permission. +.. +.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + .. raw:: html - Quickstart ========== diff --git a/docs/introduction/compatibility.md b/docs/introduction/compatibility.md index da72b5c2e8..d15a866a86 100644 --- a/docs/introduction/compatibility.md +++ b/docs/introduction/compatibility.md @@ -37,7 +37,9 @@ | Triton release version | NGC Tag | Python version | Torch version | TensorRT version | TensorRT-LLM version | CUDA version | CUDA Driver version | Size | | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| 24.10 | nvcr.io/nvidia/tritonserver:24.10-trtllm-python-py3 | Python 3.10.12 | 2.4.0a0%2B3bcc3cddb5.nv24.7 | 10.4.0 | 0.14.0 | 12.5.1.007 | 555.42.06 | 21G | +| 24.12 | nvcr.io/nvidia/tritonserver:24.12-trtllm-python-py3 | Python 3.12.3 | 2.6.0a0%2Bdf5bbc09d1.nv24.11 | 10.7.0 | 0.16.0 | 12.6.3 | 560.35.05 | 22G | +| 24.11 | nvcr.io/nvidia/tritonserver:24.11-trtllm-python-py3 | Python 3.10.12 | 2.5.0a0%2Be000cf0ad9.nv24.10 | 10.6.0 | 0.15.0 | 12.6.3 | 555.42.06 | 24.8G | +| 24.10 | nvcr.io/nvidia/tritonserver:24.10-trtllm-python-py3 | Python 3.10.12 | 2.4.0a0%2B3bcc3cddb5.nv24.7 | 10.4.0 | 0.14.0 | 12.5.1.007 | 555.42.06 | 23.3G | | 24.09 | nvcr.io/nvidia/tritonserver:24.09-trtllm-python-py3 | Python 3.10.12 | 2.4.0a0%2B3bcc3cddb5.nv24.7 | 10.4.0 | 0.13.0 | 12.5.1.007 | 555.42.06 | 21G | | 24.08 | nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 | Python 3.10.12 | 2.4.0a0%2B3bcc3cddb5.nv24.7 | 10.3.0 | 0.12.0 | 12.5.1.007 | 555.42.06 | 21G | | 24.07 | nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 | Python 3.10.12 | 2.4.0a0%2B07cecf4168.nv24.5 | 10.1.0 | 0.11.0 | 12.4.1.003 | 550.54.15 | 23G | @@ -49,7 +51,9 @@ | Triton release version | NGC Tag | Python version | vLLM version | CUDA version | CUDA Driver version | Size | | --- | --- | --- | --- | --- | --- | --- | -| 24.10 | nvcr.io/nvidia/tritonserver:24.10-vllm-python-py3 | Python 3.10.12 | 0.5.5 | 12.6.2.004 | 560.35.03 | 19G | +| 24.12 | nvcr.io/nvidia/tritonserver:24.12-vllm-python-py3 | Python 3.12.3 | 0.5.5 | 12.6.3.004 | 560.35.05 | 20G | +| 24.11 | nvcr.io/nvidia/tritonserver:24.11-vllm-python-py3 | Python 3.12.3 | 0.5.5 | 12.6.3.001 | 560.35.05 | 22.1G | +| 24.10 | nvcr.io/nvidia/tritonserver:24.10-vllm-python-py3 | Python 3.10.12 | 0.5.5 | 12.6.2.004 | 560.35.03 | 21G | | 24.09 | nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3 | Python 3.10.12 | 0.5.3.post1 | 12.6.1.006 | 560.35.03 | 19G | | 24.08 | nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3 | Python 3.10.12 | 0.5.0 post1 | 12.6.0.022 | 560.35.03 | 19G | | 24.07 | nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 | Python 3.10.12 
| 0.5.0 post1 | 12.5.1 | 555.42.06 | 19G | @@ -61,6 +65,8 @@ | Triton release version | ONNX Runtime | | --- | --- | +| 24.12 | 1.20.1 | +| 24.11 | 1.19.2 | | 24.10 | 1.19.2 | | 24.09 | 1.19.2 | | 24.08 | 1.18.1 | diff --git a/docs/introduction/index.md b/docs/introduction/index.md index 306c2082e7..4ac740f36b 100644 --- a/docs/introduction/index.md +++ b/docs/introduction/index.md @@ -54,11 +54,11 @@ the development and deployment of production AI. ## Triton Architecture The following figure shows the Triton Inference Server high-level -architecture. The [model repository](user_guide/model_repository.md) is a +architecture. The [model repository](../user_guide/model_repository.md) is a file-system based repository of the models that Triton will make available for inferencing. Inference requests arrive at the server via -either [HTTP/REST or GRPC](customization_guide/inference_protocols.md) or by the [C -API](customization_guide/inference_protocols.md) and are then routed to the appropriate per-model +either [HTTP/REST or GRPC](../customization_guide/inference_protocols.md) or by the [C +API](../customization_guide/inprocess_c_api.md) and are then routed to the appropriate per-model scheduler. Triton implements [multiple scheduling and batching algorithms](#models-and-schedulers) that can be configured on a model-by-model basis. Each model's scheduler optionally performs @@ -75,7 +75,7 @@ custom pre- and post-processing operations or even a new deep-learning framework. The models being served by Triton can be queried and controlled by a -dedicated [model management API](user_guide/model_management.md) that is +dedicated [model management API](../user_guide/model_management.md) that is available by HTTP/REST or GRPC protocol, or by the C API. Readiness and liveness health endpoints and utilization, throughput @@ -93,25 +93,25 @@ Major features include: - [Supports multiple machine learning frameworks](https://github.com/triton-inference-server/fil_backend) - [Concurrent model - execution](user_guide/model_execution.md#concurrent-model-execution) -- [Dynamic batching](user_guide/batcher.md#dynamic-batcher) -- [Sequence batching](user_guide/batcher.md#sequence-batcher) and - [implicit state management](user_guide/implicit_state_management.md#implicit-state-management) + execution](../user_guide/model_execution.md#concurrent-model-execution) +- [Dynamic batching](../user_guide/batcher.md#dynamic-batcher) +- [Sequence batching](../user_guide/batcher.md#sequence-batcher) and + [implicit state management](../user_guide/implicit_state_management.md#implicit-state-management) for stateful models - Provides [Backend API](https://github.com/triton-inference-server/backend) that allows adding custom backends and pre/post processing operations - Model pipelines using - [Ensembling](user_guide/ensemble_models.md#ensemble-models) or [Business + [Ensembling](../user_guide/ensemble_models.md#ensemble-models) or [Business Logic Scripting - (BLS)](user_guide/bls.md#business-logic-scripting) + (BLS)](../user_guide/bls.md#business-logic-scripting) - [HTTP/REST and GRPC inference - protocols](customization_guide/inference_protocols.md) based on the community + protocols](../customization_guide/inference_protocols.md) based on the community developed [KServe protocol](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2) -- A [C API](customization_guide/inprocess_c_api.md) and - [Java API](customization_guide/inprocess_java_api.md) +- A [C API](../customization_guide/inprocess_c_api.md) and + [Java 
API](../customization_guide/inprocess_java_api.md) allow Triton to link directly into your application for edge and other in-process use cases -- [Metrics](user_guide/metrics.md) indicating GPU utilization, server +- [Metrics](../user_guide/metrics.md) indicating GPU utilization, server throughput, server latency, and more Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and stay current on the latest product updates, bug fixes, content, best diff --git a/docs/introduction/release_notes.md b/docs/introduction/release_notes.md index 63f72e0c15..1901985a7e 100644 --- a/docs/introduction/release_notes.md +++ b/docs/introduction/release_notes.md @@ -25,9 +25,9 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> -# [Triton Inference Server Release 24.10](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-24-10.html#rel-24-10) +# [Triton Inference Server Release 24.12](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-24-12.html#rel-24-12) -The Triton Inference Server container image, release 24.10, is available on [NGC](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver) and is open source on [GitHub](https://github.com/triton-inference-server/server). +The Triton Inference Server container image, release 24.12, is available on [NGC](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver) and is open source on [GitHub](https://github.com/triton-inference-server/server). ## **Contents of the Triton Inference Server container** @@ -38,60 +38,63 @@ For a complete list of what the container includes, refer to [Deep Learning Fram The container also includes the following: -- [Ubuntu 22.04](http://releases.ubuntu.com/22.04/) including [Python 3.10](https://www.python.org/downloads/release/python-3100/) +- [Ubuntu 24.04](http://releases.ubuntu.com/24.04/) including [Python 3.12](https://www.python.org/downloads/release/python-3120/) -- [NVIDIA CUDA 12.6.2](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html) +- [NVIDIA CUDA 12.6.3](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html) -- [NVIDIA cuBLAS 12.6.3.3](https://docs.nvidia.com/cuda/cublas/index.html) +- [NVIDIA cuBLAS 12.6.4.1](https://docs.nvidia.com/cuda/cublas/index.html) -- [cuDNN 9.5.0.50](https://docs.nvidia.com/deeplearning/cudnn/release-notes/) +- [cuDNN 9.6.0.74](https://docs.nvidia.com/deeplearning/cudnn/release-notes/) -- [NVIDIA NCCL 2.22.3](https://docs.nvidia.com/deeplearning/nccl/release-notes/) (optimized for [NVIDIA NVLink](http://www.nvidia.com/object/nvlink.html)®) +- [NVIDIA NCCL 2.23.4](https://docs.nvidia.com/deeplearning/nccl/release-notes/) (optimized for [NVIDIA NVLink](http://www.nvidia.com/object/nvlink.html)®) -- [NVIDIA TensorRT™ 10.5.0.18](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) +- [NVIDIA TensorRT™ 10.7.0.23](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) - OpenUCX 1.15.0 - GDRCopy 2.3 -- NVIDIA HPC-X 2.20 +- NVIDIA HPC-X 2.21 - OpenMPI 4.1.7 - [FIL](https://github.com/triton-inference-server/fil_backend) -- [NVIDIA DALI® 1.42](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html) +- [NVIDIA DALI® 1.44](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html) - [nvImageCodec 0.2.0.7](https://docs.nvidia.com/cuda/nvimagecodec/release_notes_v0.2.0.html) -- ONNX Runtime
1.19.2 +- ONNX Runtime 1.20.1 -- Intel[ OpenVINO ](https://github.com/openvinotoolkit/openvino/tree/2022.1.0)2024.0.0 +- Intel[ OpenVINO ](https://github.com/openvinotoolkit/openvino/tree/2022.1.0)2024.4.0 - DCGM 3.2.6 -- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/) version [release/0.13.0](https://github.com/NVIDIA/TensorRT-LLM/tree/v0.13.0) +- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/) version [release/0.15.0](https://github.com/NVIDIA/TensorRT-LLM/tree/v0.15.0) -- [vLLM](https://github.com/vllm-project/vllm) version 0.5.3 post 1 +- [vLLM](https://github.com/vllm-project/vllm) version 0.5.5 ## **Driver Requirements** -Release 24.10 is based on [CUDA 12.6.2](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html) which requires [NVIDIA Driver](http://www.nvidia.com/Download/index.aspx?lang=en-us) release 560 or later. However, if you are running on a data center GPU (for example, T4 or any other data center GPU), you can use NVIDIA driver release 470.57 (or later R470), 525.85 (or later R525), 535.86 (or later R535), or 545.23 (or later R545). +Release 24.12 is based on [CUDA 12.6.3](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html) which requires [NVIDIA Driver](http://www.nvidia.com/Download/index.aspx?lang=en-us) release 560 or later. However, if you are running on a data center GPU (for example, T4 or any other data center GPU), you can use NVIDIA driver release 470.57 (or later R470), 525.85 (or later R525), 535.86 (or later R535), or 545.23 (or later R545). The CUDA driver's compatibility package only supports particular drivers. Thus, users should upgrade from all R418, R440, R450, R460, R510, R520, R530, R545 and R555 drivers, which are not forward-compatible with CUDA 12.6. For a complete list of supported drivers, see the [CUDA Application Compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#use-the-right-compat-package) topic. For more information, see [CUDA Compatibility and Upgrades](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#cuda-compatibility-and-upgrades). ## **GPU Requirements** -Release 24.10 supports CUDA compute capability 6.0 and later. This corresponds to GPUs in the NVIDIA Pascal, NVIDIA Volta™, NVIDIA Turing™, NVIDIA Ampere architecture, NVIDIA Hopper™, and NVIDIA Ada Lovelace architecture families. For a list of GPUs to which this compute capability corresponds, see [CUDA GPUs](https://developer.nvidia.com/cuda-gpus). For additional support details, see [Deep Learning Frameworks Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). +Release 24.12 supports CUDA compute capability 6.0 and later. This corresponds to GPUs in the NVIDIA Pascal, NVIDIA Volta™, NVIDIA Turing™, NVIDIA Ampere architecture, NVIDIA Hopper™, and NVIDIA Ada Lovelace architecture families. For a list of GPUs to which this compute capability corresponds, see [CUDA GPUs](https://developer.nvidia.com/cuda-gpus). For additional support details, see [Deep Learning Frameworks Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). ## **Key Features and Enhancements** This Inference Server release includes the following key features and enhancements. -- Optimized vLLM performance with custom metrics. +- [vLLM backend health check](https://github.com/triton-inference-server/vllm_backend/blob/r24.12/docs/health_check.md) may be optionally enabled, which unloads the model if the vLLM engine health check fails.
+- The vLLM backend supports sending [additional outputs](https://github.com/triton-inference-server/vllm_backend/blob/r24.12/docs/additional_outputs.md) from vLLM if requested.
+- Improved server stability during gRPC client cancellation.
+
 ## **Known Issues**
 
 - Numpy 2.x is not currently supported for Python Backend models and may cause them to return empty tensors unexpectedly; please use Numpy 1.x until support is added.
@@ -121,4 +124,5 @@ This Inference Server release includes the following key features and enhancemen
   - GPU tensors
   - CPU and GPU-related metrics
   - Custom execution environments
-  - The model load/unload APIs
\ No newline at end of file
+  - The model load/unload APIs
+- The latest GenAI-Perf package on pypi.org is version 0.0.9dev, while the latest Triton SDK container (24.12) contains GenAI-Perf version 0.0.8.
diff --git a/docs/perf_benchmark/genai-perf-README.rst b/docs/perf_benchmark/genai-perf-README.rst
index ea6a2d0d01..c4a3c7d73d 100644
--- a/docs/perf_benchmark/genai-perf-README.rst
+++ b/docs/perf_benchmark/genai-perf-README.rst
@@ -1,32 +1,32 @@
+..
+.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+..
+.. Redistribution and use in source and binary forms, with or without
+.. modification, are permitted provided that the following conditions
+.. are met:
+..  * Redistributions of source code must retain the above copyright
+..    notice, this list of conditions and the following disclaimer.
+..  * Redistributions in binary form must reproduce the above copyright
+..    notice, this list of conditions and the following disclaimer in the
+..    documentation and/or other materials provided with the distribution.
+..  * Neither the name of NVIDIA CORPORATION nor the names of its
+..    contributors may be used to endorse or promote products derived
+..    from this software without specific prior written permission.
+..
+.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 .. raw:: html
-
 GenAI-Perf
 ==========
diff --git a/docs/perf_benchmark/genai_perf.rst b/docs/perf_benchmark/genai_perf.rst
index d621431061..175662477f 100644
--- a/docs/perf_benchmark/genai_perf.rst
+++ b/docs/perf_benchmark/genai_perf.rst
@@ -1,3 +1,30 @@
+..
+.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+..
+.. Redistribution and use in source and binary forms, with or without
+.. modification, are permitted provided that the following conditions
+.. are met:
+..  * Redistributions of source code must retain the above copyright
+..    notice, this list of conditions and the following disclaimer.
+..  * Redistributions in binary form must reproduce the above copyright
+..    notice, this list of conditions and the following disclaimer in the
+..    documentation and/or other materials provided with the distribution.
+..  * Neither the name of NVIDIA CORPORATION nor the names of its
+..    contributors may be used to endorse or promote products derived
+..    from this software without specific prior written permission.
+..
+.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 ####
 GenAI Performance Analyzer
 ####
diff --git a/docs/perf_benchmark/model-analyzer-README.rst b/docs/perf_benchmark/model-analyzer-README.rst
index 1c31a578ff..f31e7ca633 100644
--- a/docs/perf_benchmark/model-analyzer-README.rst
+++ b/docs/perf_benchmark/model-analyzer-README.rst
@@ -1,20 +1,32 @@
-.. raw:: html
-
-   |License|
diff --git a/docs/perf_benchmark/model_analyzer.rst b/docs/perf_benchmark/model_analyzer.rst
index d66005c336..c29a96aa92 100644
--- a/docs/perf_benchmark/model_analyzer.rst
+++ b/docs/perf_benchmark/model_analyzer.rst
@@ -1,3 +1,30 @@
+..
+.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+..
+.. Redistribution and use in source and binary forms, with or without
+.. modification, are permitted provided that the following conditions
+.. are met:
+..  * Redistributions of source code must retain the above copyright
+..    notice, this list of conditions and the following disclaimer.
+..  * Redistributions in binary form must reproduce the above copyright
+..    notice, this list of conditions and the following disclaimer in the
+..    documentation and/or other materials provided with the distribution.
+..  * Neither the name of NVIDIA CORPORATION nor the names of its
+..    contributors may be used to endorse or promote products derived
+..    from this software without specific prior written permission.
+..
+.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 ####
 Model Analyzer
 ####
diff --git a/docs/perf_benchmark/perf-analyzer-README.rst b/docs/perf_benchmark/perf-analyzer-README.rst
index f51d19deb9..4f678cfdba 100644
--- a/docs/perf_benchmark/perf-analyzer-README.rst
+++ b/docs/perf_benchmark/perf-analyzer-README.rst
@@ -1,32 +1,32 @@
+..
+.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+..
+.. Redistribution and use in source and binary forms, with or without
+.. modification, are permitted provided that the following conditions
+.. are met:
+..  * Redistributions of source code must retain the above copyright
+..    notice, this list of conditions and the following disclaimer.
+..  * Redistributions in binary form must reproduce the above copyright
+..    notice, this list of conditions and the following disclaimer in the
+..    documentation and/or other materials provided with the distribution.
+..  * Neither the name of NVIDIA CORPORATION nor the names of its
+..    contributors may be used to endorse or promote products derived
+..    from this software without specific prior written permission.
+..
+.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 .. raw:: html
-
 Triton Performance Analyzer
 ===========================
diff --git a/docs/perf_benchmark/perf_analyzer.rst b/docs/perf_benchmark/perf_analyzer.rst
index 0aa5172c88..d6c6156a62 100644
--- a/docs/perf_benchmark/perf_analyzer.rst
+++ b/docs/perf_benchmark/perf_analyzer.rst
@@ -1,3 +1,30 @@
+..
+.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+..
+.. Redistribution and use in source and binary forms, with or without
+.. modification, are permitted provided that the following conditions
+.. are met:
+..  * Redistributions of source code must retain the above copyright
+..    notice, this list of conditions and the following disclaimer.
+..  * Redistributions in binary form must reproduce the above copyright
+..    notice, this list of conditions and the following disclaimer in the
+..    documentation and/or other materials provided with the distribution.
+..  * Neither the name of NVIDIA CORPORATION nor the names of its
+..    contributors may be used to endorse or promote products derived
+..    from this software without specific prior written permission.
+..
+.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 ####
 Performance Analyzer
 ####
diff --git a/docs/scaling_guide/scaling_guide.rst b/docs/scaling_guide/scaling_guide.rst
index f4d252f77e..57b4486ff0 100644
--- a/docs/scaling_guide/scaling_guide.rst
+++ b/docs/scaling_guide/scaling_guide.rst
@@ -1,3 +1,30 @@
+..
+.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+..
+.. Redistribution and use in source and binary forms, with or without
+.. modification, are permitted provided that the following conditions
+.. are met:
+..  * Redistributions of source code must retain the above copyright
+..    notice, this list of conditions and the following disclaimer.
+..  * Redistributions in binary form must reproduce the above copyright
+..    notice, this list of conditions and the following disclaimer in the
+..    documentation and/or other materials provided with the distribution.
+..  * Neither the name of NVIDIA CORPORATION nor the names of its
+..    contributors may be used to endorse or promote products derived
+..    from this software without specific prior written permission.
+..
+.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 ########
 Scaling guide
 ########
diff --git a/docs/server_guide/features.rst b/docs/server_guide/features.rst
index a14fa711c2..9a44645e3e 100644
--- a/docs/server_guide/features.rst
+++ b/docs/server_guide/features.rst
@@ -1,3 +1,30 @@
+..
+.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+..
+.. Redistribution and use in source and binary forms, with or without
+.. modification, are permitted provided that the following conditions
+.. are met:
+..  * Redistributions of source code must retain the above copyright
+..    notice, this list of conditions and the following disclaimer.
+..  * Redistributions in binary form must reproduce the above copyright
+..    notice, this list of conditions and the following disclaimer in the
+..    documentation and/or other materials provided with the distribution.
+..  * Neither the name of NVIDIA CORPORATION nor the names of its
+..    contributors may be used to endorse or promote products derived
+..    from this software without specific prior written permission.
+..
+.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 ########
 Features
 ########
diff --git a/docs/server_guide/model_pipelines.rst b/docs/server_guide/model_pipelines.rst
index 5f4dcffaaa..e12225f40d 100644
--- a/docs/server_guide/model_pipelines.rst
+++ b/docs/server_guide/model_pipelines.rst
@@ -1,3 +1,30 @@
+..
+.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+..
+.. Redistribution and use in source and binary forms, with or without
+.. modification, are permitted provided that the following conditions
+.. are met:
+..  * Redistributions of source code must retain the above copyright
+..    notice, this list of conditions and the following disclaimer.
+..  * Redistributions in binary form must reproduce the above copyright
+..    notice, this list of conditions and the following disclaimer in the
+..    documentation and/or other materials provided with the distribution.
+..  * Neither the name of NVIDIA CORPORATION nor the names of its
+..    contributors may be used to endorse or promote products derived
+..    from this software without specific prior written permission.
+..
+.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 ########
 Model Pipelines
 ########
diff --git a/docs/server_guide/state_management.rst b/docs/server_guide/state_management.rst
index 75f6b44b23..284bb4b5fe 100644
--- a/docs/server_guide/state_management.rst
+++ b/docs/server_guide/state_management.rst
@@ -1,3 +1,30 @@
+..
+.. Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+..
+.. Redistribution and use in source and binary forms, with or without
+.. modification, are permitted provided that the following conditions
+.. are met:
+..  * Redistributions of source code must retain the above copyright
+..    notice, this list of conditions and the following disclaimer.
+..  * Redistributions in binary form must reproduce the above copyright
+..    notice, this list of conditions and the following disclaimer in the
+..    documentation and/or other materials provided with the distribution.
+..  * Neither the name of NVIDIA CORPORATION nor the names of its
+..    contributors may be used to endorse or promote products derived
+..    from this software without specific prior written permission.
+..
+.. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+.. EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+.. CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+.. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+.. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+.. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+.. OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+.. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 ########
 State Management
 ########
diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md
index 3787a89a60..faf66de25c 100644
--- a/docs/user_guide/custom_operations.md
+++ b/docs/user_guide/custom_operations.md
@@ -64,7 +64,7 @@
 simple way to ensure you are using the correct version of TensorRT is to use the [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) corresponding to the Triton container. For example, if you are using
-the 24.11 version of Triton, use the 24.11 version of the TensorRT
+the 24.12 version of Triton, use the 24.12 version of the TensorRT
 container.
 
 ## TensorFlow
@@ -123,7 +123,7 @@
 simple way to ensure you are using the correct version of TensorFlow is to use the [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) corresponding to the Triton container. For example, if you are using
-the 24.11 version of Triton, use the 24.11 version of the TensorFlow
+the 24.12 version of Triton, use the 24.12 version of the TensorFlow
 container.
 
 ## PyTorch
@@ -167,7 +167,7 @@
 simple way to ensure you are using the correct version of PyTorch is to use the [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) corresponding to the Triton container. For example, if you are using
-the 24.11 version of Triton, use the 24.11 version of the PyTorch
+the 24.12 version of Triton, use the 24.12 version of the PyTorch
 container.
 
 ## ONNX
diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md
index ff21175bbe..4d1f067662 100644
--- a/docs/user_guide/performance_tuning.md
+++ b/docs/user_guide/performance_tuning.md
@@ -235,7 +235,7 @@
 with a `tritonserver` binary.
 
 ```bash
 # Start server container
-docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.11-py3
+docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.12-py3
 
 # Start serving your models
 tritonserver --model-repository=/mnt/models
@@ -284,7 +284,7 @@
 by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u
 
 ```bash
 # Start the SDK container interactively
-docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.11-py3-sdk
+docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.12-py3-sdk
 
 # Benchmark model being served from step 3
 perf_analyzer -m densenet_onnx --concurrency-range 1:4
diff --git a/python/openai/openai_frontend/frontend/fastapi/__init__.py b/python/openai/openai_frontend/frontend/fastapi/__init__.py
new file mode 100644
index 0000000000..f3dec540e2
--- /dev/null
+++ b/python/openai/openai_frontend/frontend/fastapi/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/python/openai/openai_frontend/frontend/fastapi/__init__py b/python/openai/openai_frontend/frontend/fastapi/__init__py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/python/openai/requirements.txt b/python/openai/requirements.txt
index 46807fcc9c..0d3fdbb8c1 100644
--- a/python/openai/requirements.txt
+++ b/python/openai/requirements.txt
@@ -25,8 +25,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 # FastAPI Application
-fastapi==0.111.1
+fastapi==0.115.6
 # Fix httpx version to avoid bug in openai library:
 # https://community.openai.com/t/error-with-openai-1-56-0-client-init-got-an-unexpected-keyword-argument-proxies/1040332/3
 httpx==0.27.2
 openai==1.40.6
+# Minimum starlette version needed to address CVE:
+# https://github.com/advisories/GHSA-f96h-pmfr-66vw
+starlette>=0.40.0
diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh
index c910c204ac..2bff43fafe 100755
--- a/qa/L0_openai/test.sh
+++ b/qa/L0_openai/test.sh
@@ -29,11 +29,12 @@
 function install_deps() {
     # Install python bindings for tritonserver and tritonfrontend
-    pip install /opt/tritonserver/python/triton*.whl
+    # pip install /opt/tritonserver/python/triton*.whl
 
     # Install application/testing requirements
     pushd openai/
-    pip install -r requirements.txt
+    # NOTE: Should be pre-installed in container, but can uncomment if needed
+    # pip install -r requirements.txt
     pip install -r requirements-test.txt
 
     if [ "${IMAGE_KIND}" == "TRTLLM" ]; then
@@ -49,13 +50,17 @@ function prepare_vllm() {
 }
 
 function prepare_tensorrtllm() {
+    # FIXME: Remove when testing TRT-LLM containers built from source
+    pip install -r requirements.txt
+
     MODEL="llama-3-8b-instruct"
     MODEL_REPO="tests/tensorrtllm_models"
     rm -rf ${MODEL_REPO}
 
-    # FIXME: This will require an upgrade each release to match the TRT-LLM version
+    # FIXME: This may require an upgrade each release to match the TRT-LLM version,
+    # and it would likely be easier to use trtllm-build directly for test purposes.
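+    # A sketch of that alternative (hypothetical checkpoint path; assumes a
+    # converted TRT-LLM checkpoint already exists on disk):
+    #   trtllm-build --checkpoint_dir /tmp/${MODEL}-ckpt \
+    #                --output_dir ${MODEL_REPO}/tensorrt_llm/1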
     # Use Triton CLI to prepare model repository for testing
-    pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10
+    pip install git+https://github.com/triton-inference-server/triton_cli.git@0.1.1
     # NOTE: Could use ENGINE_DEST_PATH set to NFS mount for pre-built engines in future
     triton import \
         --model ${MODEL} \
diff --git a/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py b/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py
index 15f16da352..c9883c9133 100755
--- a/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py
+++ b/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -38,6 +38,8 @@
 import numpy as np
 import sequence_util as su
 import test_util as tu
+import tritonclient.http as httpclient
+from tritonclient.utils import InferenceServerException, np_to_triton_dtype
 
 _test_system_shared_memory = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
 _test_cuda_shared_memory = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
@@ -77,6 +79,12 @@ def get_expected_result(self, expected_result, corrid, value, trial, flag_str=No
         expected_result += corrid
         return expected_result
 
+    def data_type_to_string(self, dtype):
+        if dtype == "TYPE_STRING":
+            return "BYTES"
+        else:
+            return dtype.replace("TYPE_", "")
+
     def test_skip_batch(self):
         # Test model instances together are configured with
         # total-batch-size 4. Send four sequences in parallel where
@@ -221,6 +229,78 @@ def test_skip_batch(self):
         self.cleanup_shm_regions(precreated_shm2_handles)
         self.cleanup_shm_regions(precreated_shm3_handles)
 
+    def test_corrid_data_type(self):
+        model_name = "add_sub"
+        expected_corrid_dtype = os.environ["TRITONSERVER_CORRID_DATA_TYPE"]
+
+        for corrid, corrid_dtype in [("corrid", "TYPE_STRING"), (123, "TYPE_UINT64")]:
+            # Check if the corrid data type matches the expected corrid data type specified in the model config
+            dtypes_match = True
+            if (corrid_dtype == "TYPE_STRING") and (
+                expected_corrid_dtype != "TYPE_STRING"
+            ):
+                dtypes_match = False
+            elif (corrid_dtype == "TYPE_UINT64") and (
+                expected_corrid_dtype
+                not in ["TYPE_UINT32", "TYPE_INT32", "TYPE_UINT64", "TYPE_INT64"]
+            ):
+                dtypes_match = False
+
+            with httpclient.InferenceServerClient("localhost:8000") as client:
+                input0_data = np.random.rand(16).astype(np.float32)
+                input1_data = np.random.rand(16).astype(np.float32)
+                inputs = [
+                    httpclient.InferInput(
+                        "INPUT0",
+                        input0_data.shape,
+                        np_to_triton_dtype(input0_data.dtype),
+                    ),
+                    httpclient.InferInput(
+                        "INPUT1",
+                        input1_data.shape,
+                        np_to_triton_dtype(input1_data.dtype),
+                    ),
+                ]
+
+                inputs[0].set_data_from_numpy(input0_data)
+                inputs[1].set_data_from_numpy(input1_data)
+
+                if not dtypes_match:
+                    with self.assertRaises(InferenceServerException) as e:
+                        client.infer(
+                            model_name,
+                            inputs,
+                            sequence_id=corrid,
+                            sequence_start=True,
+                            sequence_end=False,
+                        )
+                    err_str = str(e.exception)
+                    self.assertIn(
+                        f"sequence batching control 'CORRID' data-type is '{self.data_type_to_string(corrid_dtype)}', but model '{model_name}' expects '{self.data_type_to_string(expected_corrid_dtype)}'",
+                        err_str,
+                    )
+                else:
+                    response = client.infer(
+                        model_name,
+                        inputs,
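+                        # Matching CORRID data type: this request is expected to
+                        # succeed, and the add_sub outputs are verified below.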
+                        sequence_id=corrid,
+                        sequence_start=True,
+                        sequence_end=False,
+                    )
+                    response.get_response()
+                    output0_data = response.as_numpy("OUTPUT0")
+                    output1_data = response.as_numpy("OUTPUT1")
+
+                    self.assertTrue(
+                        np.allclose(input0_data + input1_data, output0_data),
+                        "add_sub example error: incorrect sum",
+                    )
+
+                    self.assertTrue(
+                        np.allclose(input0_data - input1_data, output1_data),
+                        "add_sub example error: incorrect difference",
+                    )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_sequence_corrid_batcher/test.sh b/qa/L0_sequence_corrid_batcher/test.sh
index 8d114a395a..3948cd7445 100755
--- a/qa/L0_sequence_corrid_batcher/test.sh
+++ b/qa/L0_sequence_corrid_batcher/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -121,6 +121,59 @@ for model_trial in 4; do
     done
 done
 
+# Test correlation ID data type
+mkdir -p corrid_data_type/add_sub/1
+cp ../python_models/add_sub/model.py corrid_data_type/add_sub/1
+
+for corrid_data_type in TYPE_STRING TYPE_UINT32 TYPE_INT32 TYPE_UINT64 TYPE_INT64; do
+    (cd corrid_data_type/add_sub && \
+        cp ../../../python_models/add_sub/config.pbtxt . && \
+        echo "sequence_batching { \
+            control_input [{ \
+                name: \"CORRID\" \
+                control [{ \
+                    kind: CONTROL_SEQUENCE_CORRID \
+                    data_type: $corrid_data_type \
+                }]
+            }] \
+        }" >> config.pbtxt)
+    MODEL_DIR=corrid_data_type
+
+    for i in test_corrid_data_type ; do
+        export TRITONSERVER_CORRID_DATA_TYPE=$corrid_data_type
+        SERVER_ARGS="--model-repository=`pwd`/$MODEL_DIR"
+        SERVER_LOG="./$i.$MODEL_DIR.server.log"
+        run_server
+        if [ "$SERVER_PID" == "0" ]; then
+            echo -e "\n***\n*** Failed to start $SERVER\n***"
+            cat $SERVER_LOG
+            exit 1
+        fi
+
+        echo "Test: $i, repository $MODEL_DIR" >>$CLIENT_LOG
+
+        set +e
+        python $BATCHER_TEST SequenceCorrIDBatcherTest.$i >>$CLIENT_LOG 2>&1
+        if [ $? -ne 0 ]; then
+            echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG
+            echo -e "\n***\n*** Test $i Failed\n***"
+            RET=1
+        else
+            check_test_results $TEST_RESULT_FILE 1
+            if [ $? -ne 0 ]; then
+                cat $CLIENT_LOG
+                echo -e "\n***\n*** Test Result Verification Failed\n***"
+                RET=1
+            fi
+        fi
+        set -e
+
+        unset TRITONSERVER_CORRID_DATA_TYPE
+        kill $SERVER_PID
+        wait $SERVER_PID
+    done
+done
+
 if [ $RET -eq 0 ]; then
     echo -e "\n***\n*** Test Passed\n***"
 else
diff --git a/qa/common/check_copyright.py b/qa/common/check_copyright.py
index 7f44426d45..95694dc460 100755
--- a/qa/common/check_copyright.py
+++ b/qa/common/check_copyright.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -60,6 +60,8 @@
     "docs/_static/.gitattributes",
     "docs/examples/model_repository",
     "docs/examples/jetson",
+    "docs/repositories.txt",
+    "docs/exclusions.txt",
     "docker",
     "qa/common/cuda_op_kernel.cu.cc.patch",
     "qa/ensemble_models/mix_platform_float32_float32_float32/output0_labels.txt",
@@ -191,6 +193,8 @@ def visit(path):
             prefix = "# "
         elif line.startswith("// "):
             prefix = "// "
+        elif line.startswith(".. "):
+            prefix = ".. "
" elif not line.startswith(COPYRIGHT_YEAR_RE[0]): print( "incorrect prefix for copyright line, allowed prefixes '# ' or '// ', for " diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 65ea20296f..70fdf9031f 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -34,7 +34,7 @@ # Make all generated files accessible outside of container umask 0000 # Set the version of the models -TRITON_VERSION=${TRITON_VERSION:=24.11} +TRITON_VERSION=${TRITON_VERSION:=24.12} # Set the CUDA device to use CUDA_DEVICE=${RUNNER_ID:=0} # Set TensorRT image diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops index 78de379e63..65a498bbdf 100755 --- a/qa/common/gen_qa_custom_ops +++ b/qa/common/gen_qa_custom_ops @@ -37,7 +37,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.11} +TRITON_VERSION=${TRITON_VERSION:=24.12} NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION} TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3} PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3} @@ -156,7 +156,7 @@ echo -e "\033[34m[ INFO ] - Running: $TFSCRIPT \033[0m " docker run \ --rm \ - --label RUNNER_ID=$$RUNNER_ID \ + --label RUNNER_ID=$RUNNER_ID \ --label PROJECT_NAME=$PROJECT_NAME \ $DOCKER_GPU_ARGS \ -v $DOCKER_VOLUME:/mnt \ diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index c4bd68753c..93e4dc2dfd 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -48,7 +48,7 @@ ## ############################################################################ -TRITON_VERSION=${TRITON_VERSION:=24.11} +TRITON_VERSION=${TRITON_VERSION:=24.12} # ONNX. Use ONNX_OPSET 0 to use the default for ONNX version ONNX_VERSION=1.16.1 @@ -286,8 +286,8 @@ python3 $VOLUME_SRCDIR/gen_qa_dyna_sequence_implicit_models.py --onnx --onnx_ops chmod -R 777 $VOLUME_DYNASEQIMPLICITDESTDIR python3 $VOLUME_SRCDIR/gen_qa_ragged_models.py --onnx --onnx_opset=$ONNX_OPSET --models_dir=$VOLUME_RAGGEDDESTDIR chmod -R 777 $VOLUME_RAGGEDDESTDIR -python3 $VOLUME_SRCDIR/gen_qa_ort_scalar_models.py --onnx_opset=$ONNX_OPSET --models_dir=$SCALARMODELSDESTDIR -chmod -R 777 $VOLUME_RAGGEDDESTDIR +python3 $VOLUME_SRCDIR/gen_qa_ort_scalar_models.py --onnx_opset=$ONNX_OPSET --models_dir=$VOLUME_SCALARMODELSDESTDIR +chmod -R 777 $VOLUME_SCALARMODELSDESTDIR EOF chmod a+x $ONNXSCRIPT diff --git a/qa/common/gen_qa_ort_scalar_models.py b/qa/common/gen_qa_ort_scalar_models.py index f2ddb35912..c00a97d5ed 100755 --- a/qa/common/gen_qa_ort_scalar_models.py +++ b/qa/common/gen_qa_ort_scalar_models.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -124,6 +124,10 @@ def create_onnx_modelconfig(models_dir, dtype, shape):
     )
 
     FLAGS = parser.parse_args()
+
+    if not FLAGS.models_dir:
+        raise Exception("--models_dir is required")
+
     create_onnx_modelfile(FLAGS.models_dir, shape=[1], dtype=np.float32)
     create_onnx_modelconfig(FLAGS.models_dir, shape=[1], dtype=np.float32)
     create_onnx_modelfile(FLAGS.models_dir, shape=[1, 1], dtype=np.float32)
diff --git a/qa/common/gen_qa_trt_plugin_models.py b/qa/common/gen_qa_trt_plugin_models.py
index c8c01c1b8b..0e2e9cf698 100755
--- a/qa/common/gen_qa_trt_plugin_models.py
+++ b/qa/common/gen_qa_trt_plugin_models.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -27,6 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
+import ctypes
 import os
 
 import numpy as np
@@ -38,13 +39,13 @@
 TRT_LOGGER = trt.Logger()
 trt.init_libnvinfer_plugins(TRT_LOGGER, "")
-PLUGIN_CREATORS = trt.get_plugin_registry().plugin_creator_list
 
 
 def get_trt_plugin(plugin_name):
     plugin = None
     field_collection = None
-    for plugin_creator in PLUGIN_CREATORS:
+    plugin_creators = trt.get_plugin_registry().plugin_creator_list
+    for plugin_creator in plugin_creators:
         if (plugin_creator.name == "CustomHardmax") and (
             plugin_name == "CustomHardmax"
         ):
@@ -272,13 +273,37 @@
     )
 
 
+def windows_load_plugin_lib(win_plugin_dll):
+    if os.path.isfile(win_plugin_dll):
+        try:
+            ctypes.CDLL(win_plugin_dll, winmode=0)
+        except TypeError:
+            # winmode only introduced in python 3.8
+            ctypes.CDLL(win_plugin_dll)
+        return
+
+    raise IOError('Failed to load library: "{}".'.format(win_plugin_dll))
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--models_dir", type=str, required=True, help="Top-level model directory"
     )
+    parser.add_argument(
+        "--win_plugin_dll",
+        type=str,
+        required=False,
+        default="",
+        help="Path to Windows plugin .dll",
+    )
     FLAGS, unparsed = parser.parse_known_args()
 
     import test_util as tu
 
+    # Linux can leverage LD_PRELOAD. We must load the Windows plugin manually
+    # in order for it to be discovered in the registry.
+    if os.name == "nt":
+        windows_load_plugin_lib(FLAGS.win_plugin_dll)
+
     create_plugin_models(FLAGS.models_dir)
diff --git a/tools/add_copyright.py b/tools/add_copyright.py
index 7a3d0ac216..a51ffbfc64 100644
--- a/tools/add_copyright.py
+++ b/tools/add_copyright.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -246,6 +246,11 @@ def html_md(path):
     update_or_add_header(path, "")
 
 
+@register(has_ext([".rst"]))
+def rst(path):
+    update_or_add_header(path, prefix_lines(LICENSE_TEXT, ".. "))
+
+
 def add_copyrights(paths):
     for path in paths:
         for match, handler in FILE_TYPE_HANDLERS.items():
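+            # Each entry in FILE_TYPE_HANDLERS maps a predicate to a handler;
+            # a path satisfying has_ext([".rst"]) is routed to the rst handler
+            # above, which wraps LICENSE_TEXT in ".. "-prefixed comment lines.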