From 6cab4bbe14d79d5d4f1cc94d5191dfcc06fb0b5e Mon Sep 17 00:00:00 2001
From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com>
Date: Mon, 1 Apr 2024 13:50:53 -0700
Subject: [PATCH] Update 'main' post-24.03 (#7051)

* Update README and versions for 2.44.0 / 24.03 (#6971)

* Update README and versions for 2.44.0 / 24.03

* Mchornyi 24.03 (#6972)

* Current location is dropped in 12.4

* Update Dockerfile.win10.min

* Change to triton_sample_folder (#6973)

---------

Co-authored-by: kyle
Co-authored-by: Misha Chornyi <99709299+mc-nv@users.noreply.github.com>

* Specify path for PyTorch model extension library (#7025)

* Update README.md 2.44.0 / 24.03 (#7032)

* Update README.md post-24.03

---------

Co-authored-by: Kyle McGill <101670481+nv-kmcgill53@users.noreply.github.com>
Co-authored-by: kyle
---
 Dockerfile.sdk | 2 +-
 Dockerfile.win10.min | 40 ++++++++++++-------
 README.md | 8 ++--
 build.py | 2 +-
 deploy/aws/values.yaml | 2 +-
 deploy/fleetcommand/Chart.yaml | 2 +-
 deploy/fleetcommand/values.yaml | 6 +--
 deploy/gcp/values.yaml | 2 +-
 .../perf-analyzer-script/triton_client.yaml | 2 +-
 .../server-deployer/build_and_push.sh | 4 +-
 .../server-deployer/chart/triton/Chart.yaml | 2 +-
 .../server-deployer/chart/triton/values.yaml | 6 +--
 .../server-deployer/data-test/schema.yaml | 2 +-
 .../server-deployer/schema.yaml | 4 +-
 .../gke-marketplace-app/trt-engine/README.md | 6 +--
 deploy/k8s-onprem/values.yaml | 2 +-
 deploy/oci/values.yaml | 2 +-
 docs/customization_guide/build.md | 6 +--
 docs/customization_guide/compose.md | 14 +++----
 docs/customization_guide/test.md | 2 +-
 docs/user_guide/custom_operations.md | 6 +--
 docs/user_guide/metrics.md | 2 +-
 docs/user_guide/performance_tuning.md | 4 +-
 docs/user_guide/trace.md | 2 +-
 qa/common/gen_jetson_trt_models | 2 +-
 qa/common/gen_qa_custom_ops | 6 ++-
 qa/common/gen_qa_model_repository | 2 +-
 27 files changed, 76 insertions(+), 64 deletions(-)

diff --git a/Dockerfile.sdk b/Dockerfile.sdk
index 2cf297ee5c..7ae8cf0ee8 100644
--- a/Dockerfile.sdk
+++ b/Dockerfile.sdk
@@ -29,7 +29,7 @@
 
 #
 # Base image on the minimum Triton container
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.02-py3-min
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.03-py3-min
 
 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
 ARG TRITON_COMMON_REPO_TAG=main
diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min
index 27d7f7c00f..107b2e8ac0 100644
--- a/Dockerfile.win10.min
+++ b/Dockerfile.win10.min
@@ -65,18 +65,6 @@ FROM ${BASE_IMAGE} as build_base
 
 SHELL ["cmd", "/S", "/C"]
 
-ARG CUDNN_VERSION
-ENV CUDNN_VERSION ${CUDNN_VERSION}
-COPY --from=dependency_base /cudnn /cudnn
-RUN setx PATH "c:\cudnn\bin;c:\cudnn\lib\x64;c:\cudnn\include;%PATH%"
-LABEL CUDNN_VERSION="${CUDNN_VERSION}"
-
-ARG TENSORRT_VERSION
-ENV TRT_VERSION ${TENSORRT_VERSION}
-COPY --from=dependency_base /TensorRT /TensorRT
-RUN setx PATH "c:\TensorRT\lib;%PATH%"
-LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"
-
 RUN mkdir c:\tmp
 WORKDIR /tmp
 
@@ -87,13 +75,13 @@ RUN choco install git docker unzip -y
 
 #
 # Installing python
 #
-ARG PYTHON_VERSION=3.10.11
+ARG PYTHON_VERSION=3.8.10
 ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe
 ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe
 RUN python-%PYTHON_VERSION%-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%"
 RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe"
 RUN pip install --upgrade wheel setuptools docker
-RUN pip install grpcio-tools
+RUN pip install grpcio-tools psutil
 
 LABEL PYTHON_VERSION=${PYTHON_VERSION}
@@ -101,9 +89,17 @@ LABEL PYTHON_VERSION=${PYTHON_VERSION}
 #
 # Installing CMake
 #
 ARG CMAKE_VERSION=3.27.1
-RUN pip install cmake==%CMAKE_VERSION%
+ARG CMAKE_FILE=cmake-${CMAKE_VERSION}-windows-x86_64
+ARG CMAKE_SOURCE=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_FILE}.zip
+
+ADD ${CMAKE_SOURCE} ${CMAKE_FILE}.zip
+RUN unzip %CMAKE_FILE%.zip
+RUN move %CMAKE_FILE% "c:\CMake"
+RUN setx PATH "c:\CMake\bin;%PATH%"
+
 ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake
 ENV VCPKG_TARGET_TRIPLET x64-windows
+
 LABEL CMAKE_VERSION=${CMAKE_VERSION}
 # Be aware that pip can interact badly with VS cmd shell so need to pip install before
@@ -190,6 +186,20 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi
 RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%"
 
+ARG CUDNN_VERSION
+ENV CUDNN_VERSION ${CUDNN_VERSION}
+COPY --from=dependency_base /cudnn /cudnn
+RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\."
+RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\."
+RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\."
+LABEL CUDNN_VERSION="${CUDNN_VERSION}"
+
+ARG TENSORRT_VERSION
+ENV TRT_VERSION ${TENSORRT_VERSION}
+COPY --from=dependency_base /TensorRT /TensorRT
+RUN setx PATH "c:\TensorRT\lib;%PATH%"
+LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"
+
 LABEL CUDA_VERSION="${CUDA_VERSION}"
 
 # It is important that the entrypoint initialize VisualStudio
 # environment otherwise the build will fail. Also set
diff --git a/README.md b/README.md
index bb5ecc7465..4783f8f1f7 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Inference Server.
 > [!WARNING]
 > ##### LATEST RELEASE
 > You are currently on the `main` branch which tracks under-development progress towards the next release.
-> The current release is version [2.43.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.02 container release on NVIDIA GPU Cloud (NGC).
+> The current release is version [2.44.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.03 container release on NVIDIA GPU Cloud (NGC).
 
 Triton Inference Server is an open source inference serving software that
 streamlines AI inferencing. Triton enables teams to deploy any AI model from
@@ -104,16 +104,16 @@ Inference Server with the
 
 ```bash
 # Step 1: Create the example model repository
-git clone -b r24.02 https://github.com/triton-inference-server/server.git
+git clone -b r24.03 https://github.com/triton-inference-server/server.git
 cd server/docs/examples
 ./fetch_models.sh
 
 # Step 2: Launch triton from the NGC Triton container
-docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.02-py3 tritonserver --model-repository=/models
+docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.03-py3 tritonserver --model-repository=/models
 
 # Step 3: Sending an Inference Request
 # In a separate console, launch the image_client example from the NGC Triton SDK container
-docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.02-py3-sdk
+docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.03-py3-sdk
 /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
 
 # Inference should return the following
diff --git a/build.py b/build.py
index 283f959001..2dcf84480c 100755
--- a/build.py
+++ b/build.py
@@ -72,7 +72,7 @@
 TRITON_VERSION_MAP = {
     "2.45.0dev": (
         "24.04dev",  # triton container
-        "24.02",  # upstream container
+        "24.03",  # upstream container
         "1.17.2",  # ORT
         "2023.3.0",  # ORT OpenVINO
         "2023.3.0",  # Standalone OpenVINO
diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml
index 16ed8323d7..e915da138b 100644
--- a/deploy/aws/values.yaml
+++ b/deploy/aws/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.02-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.03-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: s3://triton-inference-server-repository/model_repository
   numGpus: 1
diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml
index ad83541baf..b7acfe729c 100644
--- a/deploy/fleetcommand/Chart.yaml
+++ b/deploy/fleetcommand/Chart.yaml
@@ -26,7 +26,7 @@
 
 apiVersion: v1
 # appVersion is the Triton version; update when changing release
-appVersion: "2.43.0"
+appVersion: "2.44.0"
 description: Triton Inference Server (Fleet Command)
 name: triton-inference-server
 # version is the Chart version; update when changing anything in the chart
diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml
index 655185c6a9..ca00a2acf1 100644
--- a/deploy/fleetcommand/values.yaml
+++ b/deploy/fleetcommand/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.02-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.03-py3
   pullPolicy: IfNotPresent
   numGpus: 1
   serverCommand: tritonserver
@@ -46,13 +46,13 @@ image:
   # Model Control Mode (Optional, default: none)
   #
   # To set model control mode, uncomment and configure below
-  # See https://github.com/triton-inference-server/server/blob/r24.02/docs/model_management.md
+  # See https://github.com/triton-inference-server/server/blob/r24.03/docs/model_management.md
   # for more details
   #- --model-control-mode=explicit|poll|none
   #
   # Additional server args
   #
-  # see https://github.com/triton-inference-server/server/blob/r24.02/README.md
+  # see https://github.com/triton-inference-server/server/blob/r24.03/README.md
   # for more details
 
 service:
diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml
index 264005b539..0173f37b6f 100644
--- a/deploy/gcp/values.yaml
+++ b/deploy/gcp/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.02-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.03-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: gs://triton-inference-server-repository/model_repository
   numGpus: 1
diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
index a0d931f42d..7339361528 100644
--- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
+++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
@@ -33,7 +33,7 @@ metadata:
   namespace: default
 spec:
   containers:
-  - image: nvcr.io/nvidia/tritonserver:24.02-py3-sdk
+  - image: nvcr.io/nvidia/tritonserver:24.03-py3-sdk
     imagePullPolicy: Always
     name: nv-triton-client
     securityContext:
diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
index 952498c53f..8114dbe6f8 100755
--- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
+++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
@@ -28,8 +28,8 @@
 export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/')
 export APP_NAME=tritonserver
 export MAJOR_VERSION=2.41
-export MINOR_VERSION=2.43.0
-export NGC_VERSION=24.02-py3
+export MINOR_VERSION=2.44.0
+export NGC_VERSION=24.03-py3
 
 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION
 
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
index d973852daf..73590f2ea0 100644
--- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
@@ -28,4 +28,4 @@ apiVersion: v1
 appVersion: "2.41"
 description: Triton Inference Server
 name: triton-inference-server
-version: 2.43.0
+version: 2.44.0
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
index 3890f2b2f2..3e5eac70b5 100644
--- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
@@ -31,14 +31,14 @@ maxReplicaCount: 3
 tritonProtocol: HTTP
 # HPA GPU utilization autoscaling target
 HPATargetAverageValue: 85
-modelRepositoryPath: gs://triton_sample_models/23_12
-publishedVersion: '2.43.0'
+modelRepositoryPath: gs://triton_sample_models/24_03
+publishedVersion: '2.44.0'
 gcpMarketplace: true
 
 image:
   registry: gcr.io
   repository: nvidia-ngc-public/tritonserver
-  tag: 24.02-py3
+  tag: 24.03-py3
   pullPolicy: IfNotPresent
   # modify the model repository here to match your GCP storage bucket
   numGpus: 1
diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
index 57ec3e892d..9fd8cbe1c4 100644
--- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
@@ -27,7 +27,7 @@
 x-google-marketplace:
   schemaVersion: v2
   applicationApiVersion: v1beta1
-  publishedVersion: '2.43.0'
+  publishedVersion: '2.44.0'
   publishedVersionMetadata:
     releaseNote: >-
       Initial release.
diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml
index 1af8f82928..0efdef3e72 100644
--- a/deploy/gke-marketplace-app/server-deployer/schema.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml
@@ -27,7 +27,7 @@
 x-google-marketplace:
   schemaVersion: v2
   applicationApiVersion: v1beta1
-  publishedVersion: '2.43.0'
+  publishedVersion: '2.44.0'
   publishedVersionMetadata:
     releaseNote: >-
       Initial release.
@@ -89,7 +89,7 @@ properties:
   modelRepositoryPath:
     type: string
    title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc.
-    default: gs://triton_sample_models/23_12
+    default: gs://triton_sample_models/24_03
   image.ldPreloadPath:
     type: string
     title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin or Tensorflow custom ops, the compiled shared library must be provided via LD_PRELOAD environment variable.
diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md
index 8367057a33..fd9ad2e0a5 100644
--- a/deploy/gke-marketplace-app/trt-engine/README.md
+++ b/deploy/gke-marketplace-app/trt-engine/README.md
@@ -33,7 +33,7 @@
 ```
 docker run --gpus all -it --network host \
   --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
-  -v ~:/scripts nvcr.io/nvidia/tensorrt:24.02-py3
+  -v ~:/scripts nvcr.io/nvidia/tensorrt:24.03-py3
 
 pip install onnx six torch tf2onnx tensorflow
 
@@ -57,7 +57,7 @@ mkdir -p engines
 
 python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh
 
-gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/23_12/bert/1/model.plan
+gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24_03/bert/1/model.plan
 ```
 
-For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/23_12/` should be updated accordingly with the correct version.
+For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/24_03/` should be updated accordingly with the correct version.
diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml
index 5966f324c7..6bdf2e3cde 100644
--- a/deploy/k8s-onprem/values.yaml
+++ b/deploy/k8s-onprem/values.yaml
@@ -29,7 +29,7 @@ tags:
   loadBalancing: true
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.02-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.03-py3
   pullPolicy: IfNotPresent
   modelRepositoryServer: < Replace with the IP Address of your file server >
   modelRepositoryPath: /srv/models
diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml
index b8db949363..00d66d2594 100644
--- a/deploy/oci/values.yaml
+++ b/deploy/oci/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.02-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.03-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository
   numGpus: 1
diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md
index 4c1cf44e78..2f8b8f69d4 100644
--- a/docs/customization_guide/build.md
+++ b/docs/customization_guide/build.md
@@ -173,7 +173,7 @@ $ ./build.py ... --repo-tag=common: --repo-tag=core:
 ` will default to the branch name. For example, if you are building on the
-r24.02 branch, `` will default to r24.02. If you are
+r24.03 branch, `` will default to r24.03. If you are
 building on any other branch (including the *main* branch) then
 `` will default to "main". Therefore, you typically do not
 need to provide `` at all (nor the preceding
@@ -334,8 +334,8 @@ python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild
 If you are building on *main* branch then '' will default to
 "main". If you are building on a release branch then '' will
 default to the branch name. For example, if you
-are building on the r24.02 branch, '' will default to
-r24.02. Therefore, you typically do not need to provide ''
+are building on the r24.03 branch, '' will default to
+r24.03. Therefore, you typically do not need to provide ''
 at all (nor the preceding colon). You can use a different
 '' for a component to instead use the corresponding
 branch/tag in the build. For example, if you have a branch called
diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md
index 859ce91eba..1bb475db32 100644
--- a/docs/customization_guide/compose.md
+++ b/docs/customization_guide/compose.md
@@ -44,8 +44,8 @@ from source to get more exact customization.
 The `compose.py` script can be found in the [server repository](https://github.com/triton-inference-server/server).
 Simply clone the repository and run `compose.py` to create a custom container.
 Note: Created container version will depend on the branch that was cloned.
-For example branch [r24.02](https://github.com/triton-inference-server/server/tree/r24.02)
-should be used to create a image based on the NGC 24.02 Triton release.
+For example branch [r24.03](https://github.com/triton-inference-server/server/tree/r24.03)
+should be used to create a image based on the NGC 24.03 Triton release.
 
 `compose.py` provides `--backend`, `--repoagent` options that allow you to
 specify which backends and repository agents to include in the custom image.
@@ -76,19 +76,19 @@ For example, running
 ```
 python3 compose.py --backend tensorflow1 --repoagent checksum
 ```
-on branch [r24.02](https://github.com/triton-inference-server/server/tree/r24.02) pulls:
-- `min` container `nvcr.io/nvidia/tritonserver:24.02-py3-min`
-- `full` container `nvcr.io/nvidia/tritonserver:24.02-py3`
+on branch [r24.03](https://github.com/triton-inference-server/server/tree/r24.03) pulls:
+- `min` container `nvcr.io/nvidia/tritonserver:24.03-py3-min`
+- `full` container `nvcr.io/nvidia/tritonserver:24.03-py3`
 
 Alternatively, users can specify the version of Triton container to pull
 from any branch by either:
 1. Adding flag `--container-version ` to branch
 ```
-python3 compose.py --backend tensorflow1 --repoagent checksum --container-version 24.02
+python3 compose.py --backend tensorflow1 --repoagent checksum --container-version 24.03
 ```
 2. Specifying `--image min, --image full,`. The user is responsible for specifying compatible `min` and `full` containers.
 ```
-python3 compose.py --backend tensorflow1 --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.02-py3-min --image full,nvcr.io/nvidia/tritonserver:24.02-py3
+python3 compose.py --backend tensorflow1 --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.03-py3-min --image full,nvcr.io/nvidia/tritonserver:24.03-py3
 ```
 Method 1 and 2 will result in the same composed container. Furthermore,
 `--image` flag overrides the `--container-version` flag when both are specified.
diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md
index baa2676b6a..a64d81a27f 100644
--- a/docs/customization_guide/test.md
+++ b/docs/customization_guide/test.md
@@ -49,7 +49,7 @@ $ ./gen_qa_custom_ops
 ```
 
 This will create multiple model repositories in /tmp//qa_*
-(for example /tmp/24.02/qa_model_repository). The TensorRT models
+(for example /tmp/24.03/qa_model_repository). The TensorRT models
 will be created for the GPU on the system that CUDA considers device
 0 (zero). If you have multiple GPUs on your system see the documentation
 in the scripts for how to target a specific GPU.
diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md
index c82760544e..17d0470c47 100644
--- a/docs/user_guide/custom_operations.md
+++ b/docs/user_guide/custom_operations.md
@@ -64,7 +64,7 @@ simple way to ensure you are using the correct version of TensorRT is
 to use the
 [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt)
 corresponding to the Triton container. For example, if you are using
-the 24.02 version of Triton, use the 24.02 version of the TensorRT
+the 24.03 version of Triton, use the 24.03 version of the TensorRT
 container.
 
 ## TensorFlow
@@ -123,7 +123,7 @@ simple way to ensure you are using the correct version of TensorFlow is
 to use the
 [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
 corresponding to the Triton container. For example, if you are using
-the 24.02 version of Triton, use the 24.02 version of the TensorFlow
+the 24.03 version of Triton, use the 24.03 version of the TensorFlow
 container.
 
 ## PyTorch
@@ -167,7 +167,7 @@ simple way to ensure you are using the correct version of PyTorch is
 to use the
 [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
 corresponding to the Triton container. For example, if you are using
-the 24.02 version of Triton, use the 24.02 version of the PyTorch
+the 24.03 version of Triton, use the 24.03 version of the PyTorch
 container.
 
 ## ONNX
diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md
index 6386f17ee6..9bb4cc5337 100644
--- a/docs/user_guide/metrics.md
+++ b/docs/user_guide/metrics.md
@@ -285,7 +285,7 @@ If building Triton locally, the `TRITON_ENABLE_METRICS_CPU` CMake build flag can
 
 ## Pinned Memory Metrics
 
-Starting in 24.02, Triton offers Pinned Memory metrics to monitor the utilization of the Pinned Memory pool.
+Starting in 24.03, Triton offers Pinned Memory metrics to monitor the utilization of the Pinned Memory pool.
 
 |Category |Metric |Metric Name |Description |Granularity|Frequency |
 |----------------|------------------|----------------------------|-------------------------------------------------------|-----------|-------------|
diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md
index b118eb3953..3317e4dd5d 100644
--- a/docs/user_guide/performance_tuning.md
+++ b/docs/user_guide/performance_tuning.md
@@ -235,7 +235,7 @@ with a `tritonserver` binary.
 
 ```bash
 # Start server container
-docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.02-py3
+docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.03-py3
 
 # Start serving your models
 tritonserver --model-repository=/mnt/models
@@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u
 
 ```bash
 # Start the SDK container interactively
-docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.02-py3-sdk
+docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.03-py3-sdk
 
 # Benchmark model being served from step 3
 perf_analyzer -m densenet_onnx --concurrency-range 1:4
diff --git a/docs/user_guide/trace.md b/docs/user_guide/trace.md
index 0571867124..5c18f2dda3 100644
--- a/docs/user_guide/trace.md
+++ b/docs/user_guide/trace.md
@@ -595,7 +595,7 @@ The following table shows available OpenTelemetry trace APIs settings for
 ### OpenTelemetry Context Propagation
 
 Triton supports [context propagation](https://opentelemetry.io/docs/concepts/context-propagation/)
-in OpenTelemetry mode starting in version 24.02. Note, that every request
+in OpenTelemetry mode starting in version 24.03. Note, that every request
 with propagated OpenTelemetry context will be traced, regardless of
 `rate` and `count` trace settings. If a user wishes to trace only those
 requests, for which OpenTelemetry context was injected on the client side,
 please start Triton with
diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models
index 7d1416598a..a5fbc6a51e 100755
--- a/qa/common/gen_jetson_trt_models
+++ b/qa/common/gen_jetson_trt_models
@@ -34,7 +34,7 @@
 # Make all generated files accessible outside of container
 umask 0000
 # Set the version of the models
-TRITON_VERSION=${TRITON_VERSION:=24.02}
+TRITON_VERSION=${TRITON_VERSION:=24.03}
 # Set the CUDA device to use
 CUDA_DEVICE=${RUNNER_ID:=0}
 # Set TensorRT image
diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops
index d13cff36af..13a67c2578 100755
--- a/qa/common/gen_qa_custom_ops
+++ b/qa/common/gen_qa_custom_ops
@@ -37,7 +37,7 @@
 ##
 ############################################################################
 
-TRITON_VERSION=${TRITON_VERSION:=24.02}
+TRITON_VERSION=${TRITON_VERSION:=24.03}
 NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION}
 TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3}
 PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3}
@@ -124,13 +124,15 @@ if [ $? -ne 0 ]; then
 fi
 
 # PyTorch
+
 cat >$HOST_SRCDIR/$PYTSCRIPT <
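Since the patch is almost entirely a mechanical version bump (24.02 to 24.03 containers, 2.43.0 to 2.44.0 chart and release versions), one practical way to review it is to apply the mail and then search the tree for stale references to the previous release. The sketch below is a reviewer-side aid, not part of the patch; the patch file name is hypothetical, and it assumes the commands run from the root of a `server` checkout.

```bash
# Hypothetical file name; adjust to wherever the mail above was saved.
PATCH=0001-Update-main-post-24.03.patch

# Apply the mbox-formatted patch onto a local checkout of 'main'.
git am "$PATCH"

# Flag any leftover references to the previous release; hits are
# candidates for a follow-up commit.
git grep -nE '24\.02|2\.43\.0' -- Dockerfile.sdk Dockerfile.win10.min \
    README.md build.py deploy docs qa/common
```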